0. Loading the data

The data was exported from an Excel file with OpenOffice via "Save As" -> "Text CSV", using "UTF-8" encoding, "{Tabulator}" as the field separator and double quotes (") as the text delimiter.
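
As a rough illustration of that format, such a file can be read with Python's csv module. This is only a sketch; the actual loading below goes through the helper helpers.diana.from_excel.

import csv

# Sketch only: tab-separated, UTF-8 encoded export with double-quoted text fields.
with open("data/Hinuq3.csv", encoding="utf-8", newline="") as f:
    reader = csv.reader(f, delimiter="\t", quotechar='"')
    rows = list(reader)
print(len(rows))  # number of exported rows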

In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
import helpers.diana

tier_numbers = {
    "clause_id": 2,
    "clause_type": 3,
    "grammatical_relation": 4,
    "pos_agreement": 5,
    "last_line": 7
}
ag = helpers.diana.from_excel("data/Hinuq3.csv", tier_numbers=tier_numbers)

1. Linear order

In [3]:
import collections

verbs = [ 'COP', 'SAY', 'v.tr', 'v.intr', 'v.aff' ]
verb_map = { v: "V" for v in verbs}
others = [ 'A', 'S', 'P', 'EXP', 'STIM', 'zero-A', 'zero-S', 'zero-P', 'zero-EXP', 'zero-STIM' ]
search_terms = verbs + others

word_orders = collections.defaultdict(int)
word_orders_ids = collections.defaultdict(list)

for wo in helpers.diana.word_orders(ag, search_terms):
    word_orders[tuple(wo.word_order)] += 1
    word_orders_ids[tuple(wo.word_order)].append(wo.clause_id)

for word_order, count in word_orders.items():
    print("{0} => {1}".format(word_order, count))
    if count < 5:
        print("    {0}".format(word_orders_ids[word_order]))
('zero-EXP', 'v.aff', 'STIM') => 8
('zero-A', 'v.tr', 'P') => 42
('COP', 'S') => 33
('zero-EXP', 'STIM', 'v.aff') => 26
('EXP', 'v.aff', 'zero-STIM') => 1
    ['clause_id..n#1024']
('S', 'COP') => 94
('v.tr', 'P', 'A') => 3
    ['clause_id..n#747', 'clause_id..n#798', 'clause_id..n#772']
('zero-S', 'v.intr') => 273
('STIM', 'v.aff', 'EXP') => 7
('v.intr', 'S') => 142
('A', 'SAY') => 88
('A', 'zero-P', 'v.tr') => 24
('STIM', 'EXP', 'v.aff') => 6
('v.aff', 'EXP', 'STIM') => 10
('zero-S', 'COP') => 2
    ['clause_id..n#786', 'clause_id..n#1087']
('A', 'v.tr', 'P') => 38
('COP', 'zero-S') => 1
    ['clause_id..n#187']
('v.aff', 'STIM', 'EXP') => 1
    ['clause_id..n#1603']
('EXP', 'STIM', 'v.aff') => 28
('zero-STIM', 'v.aff', 'EXP') => 4
    ['clause_id..n#41', 'clause_id..n#872', 'clause_id..n#1225', 'clause_id..n#1663']
('P', 'A', 'v.tr') => 25
('v.tr', 'A', 'P') => 10
('zero-A', 'P', 'v.tr') => 253
('A', 'v.tr', 'zero-P') => 1
    ['clause_id..n#1278']
('S', 'v.intr') => 406
('zero-A', 'zero-P', 'v.tr') => 94
('zero-P', 'A', 'v.tr') => 4
    ['clause_id..n#957', 'clause_id..n#1472', 'clause_id..n#1526', 'clause_id..n#1518']
('COP',) => 1
    ['clause_id..n#478']
('zero-EXP', 'zero-STIM', 'v.aff') => 15
('P', 'v.tr', 'A') => 34
('A', 'P', 'v.tr') => 112
('EXP', 'v.aff', 'STIM') => 19
('zero-A', 'SAY') => 22
('zero-P', 'v.tr', 'A') => 4
    ['clause_id..n#1465', 'clause_id..n#265', 'clause_id..n#948', 'clause_id..n#1375']
('EXP', 'zero-STIM', 'v.aff') => 9
('SAY', 'A') => 31

1.1. Are subordinate clauses significantly more often verb-final than main clauses?

In [4]:
word_orders_main = []
word_orders_main_count = collections.defaultdict(int)
word_orders_sub = []
word_orders_sub_count = collections.defaultdict(int)
main_clause_types = [ "m", "m.rs" ]
sub_clause_types = [ "sub", "sub.rs" ]
clause_types = main_clause_types + sub_clause_types
search_terms = verbs + ['A', 'S', 'P', 'EXP', 'STIM']

for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    if "V" in wo.word_order and wo.clause_type in clause_types and len(wo.word_order) > 1:
        if wo.clause_type in sub_clause_types:
            word_orders_sub.append(wo.word_order)
            word_orders_sub_count[tuple(wo.word_order)] += 1
        else:
            word_orders_main.append(wo.word_order)
            word_orders_main_count[tuple(wo.word_order)] += 1 

Statistical test

Hypothesis H0: It does not depend on the clause type (sub vs. main) whether the clause unit is verb final.

In [5]:
main_v_fin = 0; main_v_nonfin = 0; sub_v_fin = 0; sub_v_nonfin = 0;
for wo, c in word_orders_main_count.items():
    if wo[-1] == "V":
        main_v_fin += c
    else:
        main_v_nonfin += c
for wo, c in word_orders_sub_count.items():
    if wo[-1] == "V":
        sub_v_fin += c
    else:
        sub_v_nonfin += c
cont_table = [ [main_v_fin, main_v_nonfin], [sub_v_fin, sub_v_nonfin] ]
cont_table
Out[5]:
[[734, 346], [343, 38]]
In [6]:
import scipy.stats
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
Out[6]:
3.7177758031965255e-19

We reject the null hypothesis, as the probability of obtaining a distribution like the observed one by chance is p < 0.05. The clause type does affect verb finality: subordinate clauses show a significantly higher proportion of verb-final word orders than main clauses.
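
For illustration, the underlying proportions can be read directly off the contingency table computed above:

# Share of verb-final clause units per clause type, from cont_table above.
main_share = cont_table[0][0] / sum(cont_table[0])  # 734 / 1080, about 0.68
sub_share = cont_table[1][0] / sum(cont_table[1])   # 343 / 381, about 0.90
print("main: {0:.2f}, sub: {1:.2f}".format(main_share, sub_share))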

1.2. How do main clauses and subordinate clauses differ in their word orders?

Here are the basic counts:

In [7]:
print("Counts for main clauses:")
for wo, c in word_orders_main_count.items():
    print("{0} => {1}".format(wo, c))
print("\nCounts for sub clauses:")
for wo, c in word_orders_sub_count.items():
    print("{0} => {1}".format(wo, c))
Counts for main clauses:
('S', 'V') => 377
('V', 'STIM', 'EXP') => 1
('V', 'S') => 163
('A', 'P', 'V') => 88
('STIM', 'V', 'EXP') => 4
('V', 'STIM') => 8
('EXP', 'STIM', 'V') => 26
('P', 'A', 'V') => 21
('EXP', 'V', 'STIM') => 19
('V', 'EXP') => 4
('A', 'V') => 106
('STIM', 'V') => 16
('V', 'P') => 24
('A', 'V', 'P') => 35
('V', 'EXP', 'STIM') => 10
('V', 'A') => 35
('EXP', 'V') => 7
('P', 'V') => 87
('V', 'P', 'A') => 3
('P', 'V', 'A') => 30
('V', 'A', 'P') => 10
('STIM', 'EXP', 'V') => 6

Counts for sub clauses:
('S', 'V') => 123
('STIM', 'V') => 10
('V', 'S') => 10
('P', 'V', 'A') => 4
('STIM', 'V', 'EXP') => 3
('EXP', 'STIM', 'V') => 2
('P', 'A', 'V') => 4
('EXP', 'V') => 3
('A', 'V', 'P') => 3
('P', 'V') => 166
('A', 'V') => 11
('A', 'P', 'V') => 24
('V', 'P') => 18

1.3. Where are G, BEN, TIME, LOC, ADD usually positioned? (e.g. before or after the verb)

In [8]:
particles = [ 'G', 'BEN', 'TIME', 'LOC', 'ADD' ]
pos_counts = [ [0, 0] for _ in particles ]
search_terms = verbs + particles

for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    for i, p in enumerate(particles):
        if "V" in wo.word_order and p in wo.word_order:
            if wo.word_order.index("V") < wo.word_order.index(p):
                pos_counts[i][0] += 1
            else:
                pos_counts[i][1] += 1
                
for i, p in enumerate(particles):
    print(p)
    print("    Count after verb:  {0}".format(pos_counts[i][0]))
    print("    Count before verb: {0}".format(pos_counts[i][1]))
G
    Count after verb:  111
    Count before verb: 252
BEN
    Count after verb:  17
    Count before verb: 68
TIME
    Count after verb:  9
    Count before verb: 99
LOC
    Count after verb:  28
    Count before verb: 122
ADD
    Count after verb:  29
    Count before verb: 33

1.4. In main clauses: are BEN arguments significantly more often positioned before the verb than after the verb?

In [9]:
particles = [ 'BEN', 'G', 'ADD' ]
pos_counts = [ [0, 0] for _ in particles ]
search_terms = verbs + particles

before = 0
after = 0
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    for i, p in enumerate(particles):
        if wo.clause_type in main_clause_types and "V" in wo.word_order and p in wo.word_order:
            if wo.word_order.index("V") < wo.word_order.index(p):
                pos_counts[i][0] += 1
            else:
                pos_counts[i][1] += 1

for i, p in enumerate(particles):
    print(p)
    print("    Count after verb:  {0}".format(pos_counts[i][0]))
    print("    Count before verb: {0}".format(pos_counts[i][1]))
BEN
    Count after verb:  14
    Count before verb: 54
G
    Count after verb:  103
    Count before verb: 151
ADD
    Count after verb:  28
    Count before verb: 29
In [10]:
part_sum = [ 0, 0 ]
for i, p in enumerate(particles):
    part_sum[0] += pos_counts[i][0]
    part_sum[1] += pos_counts[i][1]
    print("Test for '{0}'".format(p))
    print(scipy.stats.binom_test(pos_counts[i]))
print("Test for 'BEN+G+ADD'")
print(scipy.stats.binom_test(part_sum))
Test for 'BEN'
1.10972567279e-06
Test for 'G'
0.00311070663735
Test for 'ADD'
1.0
Test for 'BEN+G+ADD'
5.62656137825e-06

Except for "ADD", it is very unlikely that these counts arose by chance, so the preference for the pre-verbal position is significant for "BEN", "G" and for the combined "BEN+G+ADD".
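
To make the direction of that preference explicit, the share of pre-verbal occurrences can be computed from the pos_counts collected above:

# Share of occurrences before the verb, per relation (pos_counts[i] = [after, before]).
for i, p in enumerate(particles):
    after, before = pos_counts[i]
    print("{0}: {1:.2f} before the verb".format(p, before / (after + before)))
# BEN: 54/68 (about 0.79), G: 151/254 (about 0.59), ADD: 29/57 (about 0.51)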

1.5. In main and sub clauses: Do BEN + G + ADD occur after the verb significantly more often than A, P, LOC, A + P, or A + P + LOC?

In [11]:
particles = [ 'BEN', 'G', 'ADD', 'A', 'P', 'LOC' ]
pos_counts = [ [0, 0] for _ in particles ]
search_terms = verbs + particles

before = 0
after = 0
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    for i, p in enumerate(particles):
        if "V" in wo.word_order and p in wo.word_order:
            if wo.word_order.index("V") < wo.word_order.index(p):
                pos_counts[i][0] += 1
            else:
                pos_counts[i][1] += 1

Hypothesis H0: It does not depend on the grammatical relation type whether a participant appears before or after the verb.

For the test we use the Fisher exact test, as this test also works for small numbers (http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/scipy.stats.fisher_exact.html).

BEN + G + ADD vs. A

In [12]:
BEN_G_ADD = [ 0, 0 ]
BEN_G_ADD[0] = pos_counts[0][0] + pos_counts[1][0] + pos_counts[2][0]
BEN_G_ADD[1] = pos_counts[0][1] + pos_counts[1][1] + pos_counts[2][1]
cont_table = [ BEN_G_ADD, pos_counts[3] ]
cont_table
Out[12]:
[[157, 353], [82, 292]]
In [13]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
Out[13]:
0.0035761096863860167

We reject the null hypothesis H0 because p < 0.05. The grammatical relation type does influence whether a participant appears before or after the verb: in this case, A appears before the verb significantly more often than BEN + G + ADD.

BEN + G + ADD vs. P

In [14]:
cont_table = [ BEN_G_ADD, pos_counts[4] ]
cont_table
Out[14]:
[[157, 353], [93, 424]]
In [15]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
Out[15]:
2.0804748208486202e-06

Again, we reject the null hypothesis because p < 0.05. P occurs before the verb more often than BEN + G + ADD.

BEN + G + ADD vs. LOC

In [16]:
cont_table = [ BEN_G_ADD, pos_counts[5] ]
cont_table
Out[16]:
[[157, 353], [28, 122]]
In [17]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
Out[17]:
0.0036929308655807127

Again, we reject the null hypothesis because p < 0.05. LOC occurs before the verb more often than BEN + G + ADD.

BEN + G + ADD vs. A + P

In [18]:
A_P = [ 0, 0 ]
A_P[0] = pos_counts[3][0] + pos_counts[4][0]
A_P[1] = pos_counts[3][1] + pos_counts[4][1]
cont_table = [ BEN_G_ADD, A_P ]
cont_table
Out[18]:
[[157, 353], [175, 716]]
In [19]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
Out[19]:
3.3374996181763293e-06

Again, we reject the null hypothesis because p < 0.05. A + P occurs before the verb more often than BEN + G + ADD.

BEN + G + ADD vs. A + P + LOC

In [20]:
A_P_LOC = [ 0, 0 ]
A_P_LOC[0] = pos_counts[3][0] + pos_counts[4][0] + pos_counts[5][0]
A_P_LOC[1] = pos_counts[3][1] + pos_counts[4][1] + pos_counts[5][1]
cont_table = [ BEN_G_ADD, A_P_LOC ]
cont_table
Out[20]:
[[157, 353], [203, 838]]
In [21]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
Out[21]:
1.4327318331957229e-06

Again, we reject the null hypothesis because p < 0.05. A + P + LOC occurs before the verb more often than BEN + G + ADD.

1.5.1. In main clauses only: BEN vs. P, ADD vs. P, BEN + ADD vs. P, A vs. P, A vs. S

In [22]:
particles = [ 'BEN', 'G', 'ADD', 'A', 'P', 'LOC', 'S' ]
pos_counts = [ [0, 0] for _ in particles ]
search_terms = verbs + particles

before = 0
after = 0
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    for i, p in enumerate(particles):
        if wo.clause_type in main_clause_types and "V" in wo.word_order and p in wo.word_order:
            if wo.word_order.index("V") < wo.word_order.index(p):
                pos_counts[i][0] += 1
            else:
                pos_counts[i][1] += 1

BEN vs. P

In [23]:
cont_table = [ pos_counts[0], pos_counts[4] ]
cont_table
Out[23]:
[[14, 54], [72, 226]]
In [24]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
Out[24]:
0.63498783480640697

We cannot reject H0. There is no significant difference between BEN and P in whether they appear before or after the verb.

ADD vs. P

In [25]:
cont_table = [ pos_counts[2], pos_counts[4] ]
cont_table
Out[25]:
[[28, 29], [72, 226]]
In [26]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
Out[26]:
0.00031708160303060872

We can reject H0. P occurs before the verb more often than ADD.

BEN + ADD vs. P

In [27]:
BEN_ADD = [ 0, 0 ]
BEN_ADD[0] = pos_counts[0][0] + pos_counts[2][0]
BEN_ADD[1] = pos_counts[0][1] + pos_counts[2][1]
cont_table = [ BEN_ADD, pos_counts[4] ]
cont_table
Out[27]:
[[42, 83], [72, 226]]
In [28]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
Out[28]:
0.054556656373823037

We cannot reject H0. There is no significant difference between BEN + ADD and P in whether they appear before or after the verb. The p-value is close to 0.05, though, so with more data the difference might become significant.

A vs. P

In [29]:
cont_table = [ pos_counts[3], pos_counts[4] ]
cont_table
Out[29]:
[[78, 250], [72, 226]]
In [30]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
Out[30]:
0.92553990155932386

We cannot reject H0. There is no significant difference between A and P in whether they appear before or after the verb.

A vs. S

In [31]:
cont_table = [ pos_counts[3], pos_counts[5] ]
cont_table
Out[31]:
[[78, 250], [27, 97]]
In [32]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
Out[32]:
0.70898128553715467

We cannot reject H0. There is no significant difference between A and S in whether they appear before or after the verb.

1.6. SAY: Does SAY have an overtly expressed A more often than all other transitive verbs?

Hypothesis 1 (H0): It does not depend on the type of the verb (SAY vs. others) whether the A is expressed overtly.

For the test we use the Fisher exact test, as this test also works for small numbers (http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/scipy.stats.fisher_exact.html).

In [33]:
other_verbs = [ 'COP', 'v.tr', 'v.intr', 'v.aff' ]
search_terms = other_verbs + [ 'SAY', 'A', 'zero-A' ]
verb_map = { v: "V" for v in other_verbs }
SAY_counts = [ 0, 0 ]
others_counts = [ 0, 0 ]
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    if 'SAY' in wo.word_order:
        if 'A' in wo.word_order:
            SAY_counts[0] += 1
        elif 'zero-A' in wo.word_order:
            SAY_counts[1] += 1
    if 'V' in wo.word_order:
        if 'A' in wo.word_order:
            others_counts[0] += 1
        elif 'zero-A' in wo.word_order:
            others_counts[1] += 1
In [34]:
cont_table = [ SAY_counts, others_counts ]
cont_table
Out[34]:
[[119, 22], [255, 389]]
In [35]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
Out[35]:
3.5525785729571192e-23

We reject the null hypothesis because p < 0.05. A is expressed overtly more often in SAY sentences than in sentences with other verb types (119 of 141, about 84%, vs. 255 of 644, about 40%).

1.7. Does the A of SAY follow its verb more often than the A of all other transitive verbs?

H0: It does not depend on the verb type (SAY vs. others) whether A appears before or after the verb.

In [36]:
search_terms = other_verbs + [ 'SAY', 'A' ]
SAY_counts = [ 0, 0 ]
others_counts = [ 0, 0 ]
for wo in helpers.diana.word_orders(ag, search_terms, verb_map):
    if 'SAY' in wo.word_order and 'A' in wo.word_order:
        if wo.word_order.index("SAY") < wo.word_order.index("A"):
            SAY_counts[0] += 1
        else:
            SAY_counts[1] += 1
    if "V" in wo.word_order and "A" in wo.word_order:
        if wo.word_order.index("V") < wo.word_order.index("A"):
            others_counts[0] += 1
        else:
            others_counts[1] += 1
In [37]:
cont_table = [ SAY_counts, others_counts ]
cont_table
Out[37]:
[[31, 88], [51, 204]]
In [38]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
Out[38]:
0.22697216348643534

We cannot reject the null hypothesis, as p > 0.05. There is no significant difference between SAY and the other verbs in whether A appears before or after the verb.

1.8. Does the overt A of SAY more often precede or follow the verb (and what are the frequencies)?

In [39]:
print("A after SAY: {0}".format(SAY_counts[0]))
print("A before SAY: {0}".format(SAY_counts[1]))
A after SAY: 31
A before SAY: 88

2. Agreement

2.1. How frequent is agreement in texts (in main and subordinate clauses)?

In [40]:
verbs = [ 'v.tr', 'v.intr', 'v.aff' ]
agreements = [0, 0]; noagreements = [0, 0];
for wo in helpers.diana.word_orders(ag, verbs, with_agreement = True):
    for agr in wo.agreement:
        if wo.clause_type in sub_clause_types:
            if agr == "noagr":
                noagreements[0] += 1
            else:
                agreements[0] += 1
        else:
            if agr == "noagr":
                noagreements[1] += 1
            else:
                agreements[1] += 1

print("I found {0} verbs with and {1} verbs without agreement.".format(agreements[0]+agreements[1], noagreements[0]+noagreements[1]))
print("In main clauses: I found {0} verbs with and {1} verbs without agreement.".format(agreements[1], noagreements[1]))
print("In sub clauses: I found {0} verbs with and {1} verbs without agreement.".format(agreements[0], noagreements[0]))
I found 1253 verbs with and 346 verbs without agreement.
In main clauses: I found 838 verbs with and 222 verbs without agreement.
In sub clauses: I found 415 verbs with and 124 verbs without agreement.
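
Expressed as rates, using the counters from the cell above (a small sketch):

# Agreement rate overall and per clause type (index 1 = main, index 0 = sub, as in the cell above).
total_agr = agreements[0] + agreements[1]
total = total_agr + noagreements[0] + noagreements[1]
print("overall: {0:.2f}".format(total_agr / total))                                  # 1253/1599, about 0.78
print("main:    {0:.2f}".format(agreements[1] / (agreements[1] + noagreements[1])))  # 838/1060, about 0.79
print("sub:     {0:.2f}".format(agreements[0] / (agreements[0] + noagreements[0])))  # 415/539, about 0.77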

2.2. Do non-agreeing verbs occur more often with overtly expressed S or P or STIM arguments than agreeing verbs (because the agreement prefixes are enough to track the reference)?

Hypothesis H0: It does not depend on the verbal agreement whether S (or P or STIM) arguments are expressed overtly.

In [41]:
search_terms = verbs + [ 'S', 'P', 'STIM',  'zero-S', 'zero-P', 'zero-STIM' ]
cont_table_S = [ [0, 0], [0, 0] ]
cont_table_P = [ [0, 0], [0, 0] ]
cont_table_STIM = [ [0, 0], [0, 0] ]
cont_table_S_P_STIM =  [ [0, 0], [0, 0] ]
for wo in helpers.diana.word_orders(ag, search_terms, with_agreement = True):
    agreeing = 0  # row index for the tables below: 0 = verb shows agreement, 1 = no agreement ("noagr")
    if len(wo.word_order) != len(wo.agreement):
        continue
    for i, w in enumerate(wo.word_order):
        if w in verbs:
            if wo.agreement[i] == "noagr":
                agreeing = 1
    if "zero-S" in wo.word_order:
        cont_table_S[agreeing][0] += 1
        cont_table_S_P_STIM[agreeing][0] += 1
    elif "S" in wo.word_order:
        cont_table_S[agreeing][1] += 1
        cont_table_S_P_STIM[agreeing][1] += 1
    if "zero-P" in wo.word_order:
        cont_table_P[agreeing][0] += 1
        cont_table_S_P_STIM[agreeing][0] += 1
    elif "P" in wo.word_order:
        cont_table_P[agreeing][1] += 1
        cont_table_S_P_STIM[agreeing][1] += 1
    if "zero-STIM" in wo.word_order:
        cont_table_STIM[agreeing][0] += 1
        cont_table_S_P_STIM[agreeing][0] += 1
    elif "STIM" in wo.word_order:
        cont_table_STIM[agreeing][1] += 1
        cont_table_S_P_STIM[agreeing][1] += 1

Statistical test for S

In [42]:
cont_table_S
Out[42]:
[[237, 603], [39, 72]]
In [43]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table_S)
pvalue
Out[43]:
0.14782310141174806

We cannot reject the null hypothesis, because p > 0.05. It does not depend on the verbal agreement whether S is expressed overtly.

Statistical test for P

In [44]:
cont_table_P
Out[44]:
[[70, 344], [57, 173]]
In [45]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table_P)
pvalue
Out[45]:
0.017638070366621489

We can reject the null hypothesis, as p < 0.05. Whether P is expressed overtly does depend on verbal agreement: P is expressed overtly more often when the verb shows agreement.

Statistical test for STIM

In [46]:
cont_table_STIM
Out[46]:
[[28, 101], [1, 4]]
In [47]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table_STIM)
pvalue
Out[47]:
1.0

Here the test is not informative, as the number of non-agreeing verbs is too low. Practically all verbs that take a STIM argument show agreement, whether or not the argument is expressed.

Statistical test for S + P + STIM

In [48]:
cont_table_S_P_STIM
Out[48]:
[[335, 1048], [97, 249]]
In [49]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table_S_P_STIM)
pvalue
Out[49]:
0.14533478714180809

We cannot reject the null hypothesis, as p > 0.05. It does not depend on the verbal agreement whether S + P + STIM are expressed overtly.

2.3. Are there differences between the prefixes?

2.3.1. How often do the prefixes occur at all in the texts?

In [75]:
verbs = [ 'v.tr', 'v.intr', 'v.aff' ]
agreement_sum = collections.defaultdict(int)
for wo in helpers.diana.word_orders(ag, verbs, with_agreement = True):
    for agr in wo.agreement:
        agreement_sum[agr] += 1
for agr, count in agreement_sum.items():
    print("{} => {}".format(agr, count))
r-5 => 200
o-1 => 354
noagr => 346
y-2 => 161
r-nhpl => 51
r-hpl => 2
b-hpl => 116
b-3 => 316
y-4 => 53

2.3.2. How often do the prefixes occur with overt arguments?

We first check whether the agreement is consistent, i.e. whether the class marker on S, P or STIM is the same as the one on the verb. All other cases are printed together with their clause ID, so they can be checked manually.
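
The agreement tags have the form marker-class on the verb (e.g. "b-3"); the nominal tags are split in the same way, with an optional subtype after a dot. The noun tag in the following sketch is a made-up example, only meant to illustrate the comparison done in the cell below.

# Sketch of the class-marker comparison; "n-3.sg" is a hypothetical noun tag.
v_tag = "b-3"                         # verb tag: agreement marker + class
v_marker, v_class = v_tag.split("-")
n_tag = "n-3.sg"                      # hypothetical noun tag: prefix + class (+ optional subtype)
n_class = n_tag.split("-")[1]
if "." in n_class:
    n_class, _ = n_class.split(".")
print(v_class == n_class)             # True: noun class matches verb class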

In [51]:
verbs = [ 'v.tr', 'v.intr', 'v.aff' ]
search_terms = verbs + [ 'S', 'P', 'STIM', 'zero-S', 'zero-P', 'zero-STIM' ]
agreements = collections.defaultdict(int)
for wo in helpers.diana.word_orders(ag, search_terms, with_agreement = True):
    v_class = None; n_class = None; v_marker = None; n_type = None;
    agreement = False
    zero = False
    if len(wo.word_order) != len(wo.agreement):
        print("length on blue and yellow line different in ID {}".format(wo.clause_id))
        continue
    for i, w in enumerate(wo.word_order):
        if w in verbs:
            if wo.agreement[i] != "noagr":
                agreement = True
                if "-" in wo.agreement[i]:
                    v_marker, v_class = wo.agreement[i].split("-")
                else:
                    print("no dash in v agr in ID {}".format(wo.clause_id))
        else:
            if "-" in wo.agreement[i]:
                n_split = wo.agreement[i].split("-")
                if len(n_split) > 2:
                    print("more than one dash in n agr in ID {}".format(wo.clause_id))
                n_class = n_split[1]
                #n_type = n_split[1]
                if "." in n_class:
                    n_class, _ = n_class.split(".")
            else:
                print("no dash in n agr in ID {}".format(wo.clause_id))
            if w.startswith("zero-"):
                zero = True
                
    if v_class != n_class and agreement:
        print("n class does not equal v class in ID {} (n_class: {} vs. v_class: {})".format(wo.clause_id, n_class, v_class))
    elif v_class is not None and n_class is not None and not zero:
            agreements["{}-{}".format(v_marker, v_class)] += 1
n class does not equal v class in ID clause_id..n#1037 (n_class: imp vs. v_class: 5)
n class does not equal v class in ID clause_id..n#1552 (n_class: 2 vs. v_class: hpl)
n class does not equal v class in ID clause_id..n#1139 (n_class: 3 vs. v_class: 1)

Here are the counts for all overt arguments where class markers were equal:

In [52]:
for agr, count in agreements.items():
    print("{} => {}".format(agr, count))
r-5 => 167
o-1 => 222
y-2 => 119
r-nhpl => 46
r-hpl => 2
b-hpl => 64
b-3 => 257
y-4 => 42

2.3.3. Does the o-1 prefix occur with an overt argument more often than the b/r/y prefixes?

H0: It does not depend on the prefix (o-1 vs. b) whether the argument is expressed overtly.

In [78]:
cont_table = [ [ agreements["o-1"], agreement_sum["o-1"]-agreements["o-1"] ],
            [ agreements["b-3"]+agreements["b-hpl"],
              agreement_sum["b-3"]+agreement_sum["b-hpl"]-agreements["b-3"]-agreements["b-hpl"] ] ]
cont_table
Out[78]:
[[222, 132], [321, 111]]
In [80]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
Out[80]:
0.00062729277493344911

We reject H0 because p < 0.05. The "b" prefix occurs with overt arguments significantly more often than the "o-1" prefix.

H0: It does not depend on the prefix (o-1 vs. r) whether the argument is expressed overtly.

In [81]:
cont_table = [ [ agreements["o-1"], agreement_sum["o-1"]-agreements["o-1"] ],
            [ agreements["r-5"]+agreements["r-hpl"]+agreements["r-nhpl"],
              agreement_sum["r-5"]+agreement_sum["r-hpl"]+agreement_sum["r-nhpl"]-agreements["r-5"]-agreements["r-hpl"]-agreements["r-nhpl"] ] ]
cont_table
Out[81]:
[[222, 132], [215, 38]]
In [82]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
Out[82]:
9.856708011476732e-10

We reject H0 because p < 0.05. The "r" prefix occurs with overt arguments significantly more often than the "o-1" prefix.

H0: It does not depend on the prefix (o-1 vs. y) whether the argument is expressed overtly.

In [83]:
cont_table = [ [ agreements["o-1"], agreement_sum["o-1"]-agreements["o-1"] ],
            [ agreements["y-2"]+agreements["y-4"],
              agreement_sum["y-2"]+agreement_sum["y-4"]-agreements["y-2"]-agreements["y-4"] ] ]
cont_table
Out[83]:
[[222, 132], [161, 53]]
In [84]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
Out[84]:
0.0022627293600340314

We reject H0 because p < 0.05. The "y" prefix occurs with overt arguments significantly more often than the "o-1" prefix.

3. Referential density and distance

3.1. What is the referential density for COP, v.intr, v.tr and v.aff?

COP

In [53]:
search_terms = [ "COP", "S" ]
possible = 0
overt = 0
for wo in helpers.diana.word_orders(ag, search_terms):
    if "COP" in wo.word_order:
        possible += 1
        if "S" in wo.word_order:
            overt += 1
print("{} / {} = {}".format(overt, possible, float(overt)/possible))
127 / 131 = 0.9694656488549618

v.intr

In [54]:
search_terms = [ "v.intr", "S" ]
possible = 0
overt = 0
for wo in helpers.diana.word_orders(ag, search_terms):
    if "v.intr" in wo.word_order:
        possible += 1
        if "S" in wo.word_order:
            overt += 1
print("{} / {} = {}".format(overt, possible, float(overt)/possible))
548 / 821 = 0.6674786845310596

v.tr

In [55]:
search_terms = [ "v.tr", "A", "P" ]
possible = 0
overt = 0
for wo in helpers.diana.word_orders(ag, search_terms):
    if "v.tr" in wo.word_order:
        possible += 2
        if "A" in wo.word_order:
            overt += 1
        if "P" in wo.word_order:
            overt += 1
print("{} / {} = {}".format(overt, possible, float(overt)/possible))
772 / 1288 = 0.5993788819875776

v.aff

In [56]:
search_terms = [ "v.aff", "EXP", "STIM" ]
possible = 0
overt = 0
for wo in helpers.diana.word_orders(ag, search_terms):
    if "v.aff" in wo.word_order:
        possible += 2
        if "EXP" in wo.word_order:
            overt += 1
        if "STIM" in wo.word_order:
            overt += 1
print("{} / {} = {}".format(overt, possible, float(overt)/possible))
190 / 268 = 0.7089552238805971
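
The four cells above all follow the same pattern, so a small helper along the following lines could compute the density for any verb/argument combination. This is only a sketch; such a function is not part of helpers.diana.

def referential_density(ag, verb, arguments):
    """Return (overt, possible) argument slots for clauses containing `verb` (sketch)."""
    possible = 0
    overt = 0
    for wo in helpers.diana.word_orders(ag, [verb] + list(arguments)):
        if verb in wo.word_order:
            possible += len(arguments)
            overt += sum(1 for a in arguments if a in wo.word_order)
    return overt, possible

# Example: referential_density(ag, "v.tr", ["A", "P"]) should reproduce the 772 / 1288 from above.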

3.2. Is the referential density for agreeing X higher than for non-agreeing X?

v.intr

In [57]:
search_terms = [ "v.intr", "S" ]
agr_possible = 0
agr_overt = 0
noagr_possible = 0
noagr_overt = 0
for wo in helpers.diana.word_orders(ag, search_terms, with_agreement = True):
    if len(wo.word_order) != len(wo.agreement):
        continue
    if "v.intr" in wo.word_order:
        v_index = wo.word_order.index("v.intr")
        if wo.agreement[v_index] == "noagr":
            noagr_possible += 1
            if "S" in wo.word_order:
                noagr_overt += 1
        else:
            agr_possible += 1
            if "S" in wo.word_order:
                agr_overt += 1
print("with agreement: {} / {} = {}".format(agr_overt, agr_possible, float(agr_overt)/agr_possible))
print("without agreement: {} / {} = {}".format(noagr_overt, noagr_possible, float(noagr_overt)/noagr_possible))
with agreement: 476 / 710 = 0.6704225352112676
without agreement: 72 / 111 = 0.6486486486486487

Statistical test

Hypothesis H0: It does not depend on agreement whether S in v.intr sentences is expressed overtly.

In [58]:
cont_table = [ [agr_overt, agr_possible-agr_overt], [noagr_overt, noagr_possible-noagr_overt] ]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)
0.665525212907

We cannot reject H0, as p > 0.05. It does not depend on agreement whether the argument is expressed overtly.

v.tr

In [59]:
search_terms = [ "v.tr", "A", "P" ]
agr_possible = 0
agr_overt = 0
noagr_possible = 0
noagr_overt = 0
for wo in helpers.diana.word_orders(ag, search_terms, with_agreement = True):
    if len(wo.word_order) != len(wo.agreement):
        continue
    if "v.tr" in wo.word_order:
        v_index = wo.word_order.index("v.tr")
        if wo.agreement[v_index] == "noagr":
            noagr_possible += 2
            if "A" in wo.word_order:
                noagr_overt += 1
            if "P" in wo.word_order:
                noagr_overt += 1
        else:
            agr_possible += 2
            if "A" in wo.word_order:
                agr_overt += 1
            if "P" in wo.word_order:
                agr_overt += 1
print("with agreement: {} / {} = {}".format(agr_overt, agr_possible, float(agr_overt)/agr_possible))
print("without agreement: {} / {} = {}".format(noagr_overt, noagr_possible, float(noagr_overt)/noagr_possible))
with agreement: 506 / 828 = 0.6111111111111112
without agreement: 266 / 460 = 0.5782608695652174

Statistical test

Hypothesis H0: It does not depend on agreement whether A and P in v.tr sentences are expressed overtly.

In [60]:
cont_table = [ [agr_overt, agr_possible-agr_overt], [noagr_overt, noagr_possible-noagr_overt] ]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)
0.259811072564

We cannot reject H0, as p > 0.05. It does not depend on agreement whether the arguments are expressed overtly.

v.aff

In [61]:
search_terms = [ "v.aff", "EXP", "STIM" ]
agr_possible = 0
agr_overt = 0
noagr_possible = 0
noagr_overt = 0
for wo in helpers.diana.word_orders(ag, search_terms, with_agreement = True):
    if len(wo.word_order) != len(wo.agreement):
        continue
    if "v.aff" in wo.word_order:
        v_index = wo.word_order.index("v.aff")
        if wo.agreement[v_index] == "noagr":
            noagr_possible += 2
            if "EXP" in wo.word_order:
                noagr_overt += 1
            if "STIM" in wo.word_order:
                noagr_overt += 1
        else:
            agr_possible += 2
            if "EXP" in wo.word_order:
                agr_overt += 1
            if "STIM" in wo.word_order:
                agr_overt += 1
print("with agreement: {} / {} = {}".format(agr_overt, agr_possible, float(agr_overt)/agr_possible))
print("without agreement: {} / {} = {}".format(noagr_overt, noagr_possible, float(noagr_overt)/noagr_possible))
with agreement: 185 / 258 = 0.7170542635658915
without agreement: 5 / 10 = 0.5

Statistical test

Hypothesis H0: It does not depend on agreement whether EXP and STIM in v.aff sentences are expressed overtly.

In [62]:
cont_table = [ [agr_overt, agr_possible-agr_overt], [noagr_overt, noagr_possible-noagr_overt] ]
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
print(pvalue)
0.160953027422

We cannot reject H0, as p > 0.05. It does not depend on agreement whether the arguments are expressed overtly.

4. Arguments in "v.aff" vs. "v.tr" sentences: overt vs. zero

Counts

In [63]:
v_tree = { "v.aff": collections.defaultdict(int), "v.tr": collections.defaultdict(int) }
for wo in word_orders:
    v = None
    if "v.tr" in wo:
        v = "v.tr"
    if "v.aff" in wo:
        v = "v.aff"
    if v is not None:
        wo2 = tuple([e for e in sorted(wo) if e != "v.aff" and e != "v.tr"])
        v_tree[v][wo2] += word_orders[wo]

for v in ["v.aff", "v.tr"]:
    print(v)
    for e in v_tree[v]:
        print("{0} => {1}".format(e, v_tree[v][e]))
v.aff
('zero-EXP', 'zero-STIM') => 15
('STIM', 'zero-EXP') => 34
('EXP', 'zero-STIM') => 14
('EXP', 'STIM') => 71
v.tr
('A', 'zero-P') => 33
('A', 'P') => 222
('P', 'zero-A') => 295
('zero-A', 'zero-P') => 94

Statistical test for A/EXP

Hypothesis 1 (H0): It does not depend on the type of the verb ("v.tr" vs. "v.aff") whether the A/EXP is expressed overtly.

For the test we use the Fisher exact test, as this test also works for small numbers (http://docs.scipy.org/doc/scipy-0.13.0/reference/generated/scipy.stats.fisher_exact.html).

In [64]:
import scipy.stats
cont_table = [
    [ v_tree["v.aff"][('EXP', 'zero-STIM')] + v_tree["v.aff"][('EXP', 'STIM')],
      v_tree["v.aff"][('zero-EXP', 'zero-STIM')] + v_tree["v.aff"][('STIM', 'zero-EXP')] ],
    [ v_tree["v.tr"][('A', 'P')] + v_tree["v.tr"][('A', 'zero-P')], 
      v_tree["v.tr"][('P', 'zero-A')] + v_tree["v.tr"][('zero-A', 'zero-P')] ]
]
cont_table
Out[64]:
[[85, 49], [255, 389]]
In [65]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
Out[65]:
5.1977253135270883e-07

We reject the null hypothesis, as the probability of obtaining a distribution like the observed one by chance is p < 0.05. The verb type affects the overtness of the A/EXP argument: "v.aff" sentences have an overt EXP significantly more often (85 of 134, about 63%) than "v.tr" sentences have an overt A (255 of 644, about 40%).

Statistical test for P/STIM

Hypothesis 2 (H0): It does not depend on the type of the verb ("v.tr" vs. "v.aff") whether the P/STIM is expressed overtly.

In [66]:
import scipy.stats
cont_table = [
    [ v_tree["v.aff"][('STIM', 'zero-EXP')] + v_tree["v.aff"][('EXP', 'STIM')],
      v_tree["v.aff"][('zero-EXP', 'zero-STIM')] + v_tree["v.aff"][('EXP', 'zero-STIM')] ],
    [ v_tree["v.tr"][('A', 'P')] + v_tree["v.tr"][('P', 'zero-A')], 
      v_tree["v.tr"][('A', 'zero-P')] + v_tree["v.tr"][('zero-A', 'zero-P')] ]
]
cont_table
Out[66]:
[[105, 29], [517, 127]]
In [67]:
oddsratio, pvalue = scipy.stats.fisher_exact(cont_table)
pvalue
Out[67]:
0.63554718109871811

In this case we cannot reject the null hypothesis, as p > 0.05. There is no statistical evidence that the verb type affects the overtness of P/STIM.

5. Positions of S, A and P

In [68]:
other_verbs = [ 'COP', 'v.tr', 'v.intr', 'v.aff' ]
verb_map = { v: "V" for v in other_verbs }

A_values = []
P_values = []
S_values = []
for wo in helpers.diana.word_orders(ag, annotation_map = verb_map):
    word_order = [w for w in wo.word_order if not w.startswith("zero-")]
    if "V" in word_order:
        v_index = word_order.index("V")
        if "A" in word_order:
            A_values.append(word_order.index("A") - v_index)
        if "P" in word_order:
            P_values.append(word_order.index("P") - v_index)
        if "S" in word_order:
            S_values.append(word_order.index("S") - v_index)
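
The collected values are positions relative to the verb: negative values mean the argument precedes the verb, positive values mean it follows (zero-marked arguments are dropped first). A minimal example of the computation:

# Sketch: in the order ('S', 'V'), S is one slot before the verb, so its value is -1.
example = ["S", "V"]
print(example.index("S") - example.index("V"))  # -1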
In [69]:
%matplotlib inline
import matplotlib.pyplot as plt
In [70]:
fig, axs = plt.subplots(1, 3, figsize=(14,4))
axs[0].hist(S_values, range(min(S_values), max(S_values)+2))
axs[0].set_title("Positions of S")
axs[1].hist(A_values, range(min(A_values), max(A_values)+2))
axs[1].set_title("Positions of A")
axs[2].hist(P_values, range(min(P_values), max(P_values)+2))
ret = axs[2].set_title("Positions of P")

Box plots of positions

In [71]:
plt.figure(figsize=(10,6))
plt.boxplot([S_values, A_values, P_values])
plt.title("Positions of S, A and P")
ret = plt.xticks([1, 2, 3], ["S", "A", "P"])

Plots by clause type

In [72]:
A_values = [[], []]
P_values = [[], []]
S_values = [[], []]
clause_types = ["m", "m.rs", "sub", "sub.rs"]
for wo in helpers.diana.word_orders(ag, annotation_map = verb_map):
    word_order = [w for w in wo.word_order if not w.startswith("zero-")]

    if "V" in word_order and wo.clause_type in clause_types:
        ind = 0
        if wo.clause_type == "sub" or wo.clause_type == "sub.rs":
            ind = 1
        v_index = word_order.index("V")
        if "A" in word_order:
            A_values[ind].append(word_order.index("A") - v_index)
        if "P" in word_order:
            P_values[ind].append(word_order.index("P") - v_index)
        if "S" in word_order:
            S_values[ind].append(word_order.index("S") - v_index)
In [73]:
fig, axs = plt.subplots(2, 3, figsize=(14,10))
for ind in [0, 1]:
    type_text = "main"
    if ind == 1:
        type_text = "sub"
    axs[ind][0].hist(S_values[ind], range(min(S_values[ind]), max(S_values[ind])+2))
    axs[ind][0].set_title("Positions of S in {0} clauses".format(type_text))
    axs[ind][1].hist(A_values[ind], range(min(A_values[ind]), max(A_values[ind])+2))
    axs[ind][1].set_title("Positions of A in {0} clauses".format(type_text))
    axs[ind][2].hist(P_values[ind], range(min(P_values[ind]), max(P_values[ind])+2))
    ret = axs[ind][2].set_title("Positions of P in {0} clauses".format(type_text))

Box plots by clause type

In [74]:
fig, axs = plt.subplots(1, 2, figsize=(14,6))
for ind in [0, 1]:
    type_text = "main"
    if ind == 1:
        type_text = "sub"

    axs[ind].boxplot([S_values[ind], A_values[ind], P_values[ind]])
    axs[ind].set_title("Positions of S, A and P in {0} clauses".format(type_text))
    axs[ind].set_xticks([1, 2, 3])
    ret = axs[ind].set_xticklabels(["S", "A", "P"])
In [ ]: