In [1]:

import functools
import hashlib
import io
import pathlib
import shutil
import urllib.request
import zipfile

URL = 'https://zenodo.org/record/841982/files/xflr6/portmanteaus-v1.0.zip'

CSV = pathlib.Path('esm3-analyses.csv')


def sha256sum(filename, bufsize=32768) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``filename``.

    The file is opened in binary mode and consumed in chunks of at most
    ``bufsize`` bytes, so it is never held in memory as a whole.
    """
    digest = hashlib.sha256()
    with open(filename, 'rb') as stream:
        while True:
            chunk = stream.read(bufsize)
            if not chunk:  # an empty bytes object signals end of file
                break
            digest.update(chunk)
    return digest.hexdigest()


# Download the archive only when the CSV is not already present locally.
if not CSV.exists():
    # The whole ZIP is buffered in memory; only the CSV is written to disk.
    with io.BytesIO() as b:
        with urllib.request.urlopen(URL) as u:
            shutil.copyfileobj(u, b)
        with zipfile.ZipFile(b) as z:
            # The one-element unpacking raises ValueError unless exactly one
            # archive member name ends with the CSV file name.
            i, = (i for i in z.infolist() if i.filename.endswith(CSV.name))
            # Replace the member's stored path so it is extracted as CSV.name.
            i.filename = CSV.name
            z.extract(i)

# Display the checksum of the local file (it is shown, not compared).
sha256sum(CSV)

Out[1]:

'6e6389a913cc01020d03ac16217bc1c63c9d0e16b78179b4c931741c0d5a69cf'

In [2]:

%matplotlib inline

import types
import warnings

import matplotlib.pyplot as plt
import numpy as np
from numpy import count_nonzero as ntrue
import pandas as pd
import scipy.stats


def crosstab(df, index, columns, tests=False, margins=True, **kwargs):
    """Cross-tabulate columns of ``df``, optionally with independence tests.

    Args:
        df: Data frame providing the columns to tabulate.
        index: Column name, or list of column names, used for the rows.
        columns: Column name, or list of column names, used for the columns.
        tests: If truthy, also run ``chi2_contingency`` and ``fisher_exact``
            on the table. A string is used as the row label of the test
            results; otherwise the label is ``f'{index} & {columns}'``.
        margins: Forwarded to ``pd.crosstab``. When true, the last row and
            last column (the totals) are excluded from the tests.
        **kwargs: Forwarded to ``pd.crosstab``.

    Returns:
        The crosstab frame if ``tests`` is falsy. Otherwise a
        ``types.SimpleNamespace`` with the attributes ``df`` (the crosstab),
        ``expected`` (expected frequencies as a frame), ``chi2`` and
        ``fisher_exact`` (one-row frames labelled as described above).
        ``expected`` and ``chi2`` are ``None`` when ``chi2_contingency``
        returns ``None`` (i.e. the underlying test raised ``ValueError``).
    """
    idx = [df[i] for i in index] if isinstance(index, list) else df[index]
    cols = [df[c] for c in columns] if isinstance(columns, list) else df[columns]
    result = pd.crosstab(idx, cols, margins=margins, **kwargs)
    if not tests:
        return result

    result = types.SimpleNamespace(df=result)
    # The tests must see the observed counts only, not the marginal totals.
    values = result.df.iloc[:-1, :-1] if margins else result.df
    label = tests if isinstance(tests, str) else f'{index} & {columns}'

    chi2 = chi2_contingency(values)
    if chi2 is None:
        # chi2_contingency() signals a failed test with None; previously
        # this crashed with AttributeError on ``None.to_frame``.
        result.expected = result.chi2 = None
    else:
        result.expected = pd.DataFrame(
            chi2['expected'],
            index=values.index, columns=values.columns)
        result.chi2 = chi2.to_frame(label).T
    result.fisher_exact = fisher_exact(values).to_frame(label).T
    return result


def chi2_contingency(x, func=scipy.stats.chi2_contingency):
    """Run a chi-square contingency test on ``x`` and return it as a Series.

    The Series is labelled ``chi2``, ``p``, ``dof``, ``expected`` and
    ``usable``; ``usable`` tells whether every expected frequency is at
    least 5. Returns ``None`` when ``func`` raises ``ValueError``.
    """
    try:
        statistic, pvalue, dof, expected = func(x)
    except ValueError:
        return None
    labels = ('chi2', 'p', 'dof', 'expected', 'usable')
    usable = (expected >= 5).all()
    return pd.Series((statistic, pvalue, dof, expected, usable), index=labels)


def fisher_exact(x, func=scipy.stats.fisher_exact):
    """Run Fisher's exact test on ``x``; return a Series labelled ``odds`` and ``p``."""
    outcome = func(x)
    labels = ('odds', 'p')
    return pd.Series(outcome, index=labels)


def pearsonr(df, left, right, func=scipy.stats.pearsonr):
    """Correlate the columns ``left`` and ``right`` of ``df``.

    Rows with a missing value in either column are dropped first. Constant-
    input warnings and invalid floating-point operations are silenced while
    ``func`` runs. The result is a Series labelled ``r`` and ``p`` and named
    ``'<left> & <right>'``.
    """
    pair = df[[left, right]].dropna()
    with warnings.catch_warnings(), np.errstate(invalid='ignore'):
        warnings.simplefilter('ignore', scipy.stats.ConstantInputWarning)
        outcome = func(pair[left], pair[right])
    return pd.Series(outcome, index=('r', 'p'), name=f'{left} & {right}')


# Use matplotlib's 'classic' style and override figure/font/savefig defaults
# for every plot in this notebook.
plt.style.use('classic')
plt.rcParams.update({'figure.figsize': (6, 4), 'figure.facecolor': 'w',
                     'figure.subplot.bottom': .125, 'font.size': 10, 'savefig.dpi': 72})

In [3]:

# Replacement names applied to the 'Language' column after loading.
RENAME = {'Quechua (Ayacucho)': 'Ayacucho',
          'Tlachichilco Tepehuan': 'Tepehua',
          'Lakhota': 'Lakota'}


df = pd.read_csv(CSV, encoding='utf-8')

df['Language'] = df['Language'].replace(RENAME)

df.info()
# Each (language, cell, position) combination must occur at most once.
assert df.set_index(['Language', 'Cell', 'Position']).index.is_unique
df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2818 entries, 0 to 2817
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Language  2818 non-null   object
 1   Cell      2818 non-null   object
 2   Position  2818 non-null   int64 
 3   Form      2818 non-null   object
 4   Meaning   2818 non-null   object
dtypes: int64(1), object(4)
memory usage: 110.2+ KB

Out[3]:

	Language	Cell	Position	Form	Meaning
0	Ainu	1s	-1	ku	SA[+1 +sg]
1	Ainu	1p	1	as	S[+1 +pl]
2	Ainu	2s	-1	e	SAP[-3 +sg]
3	Ainu	2p	-1	eci	SAP[+2]
4	Ainu	x	1	an	S[-1 -2 -3]
5	Ainu	1s->2s	-1	eci	SAP[+2]
6	Ainu	1s->2p	-1	eci	SAP[+2]
7	Ainu	1s->3s	-1	ku	SA[+1 +sg]
8	Ainu	1s->3p	-1	ku	SA[+1 +sg]
9	Ainu	1s->x	-2	ku	SA[+1 +sg]

In [4]:

import itertools

# Separator splitting a cell key into its two sides (e.g. '1s->2s'); its
# presence in 'Cell' marks a transitive cell and in 'Meaning' a portmanteau.
SEP = '->'


def is_local(cellkey, sep=SEP, persons=('1', '2')) -> bool:
    """Return True if both sides of ``cellkey`` mention one of ``persons``.

    ``cellkey`` is split at the first ``sep``. A key without ``sep`` has an
    empty right-hand side and is therefore never local.
    """
    left, _, right = cellkey.partition(sep)
    for side in (left, right):
        if not any(person in side for person in persons):
            return False
    return True


def is_inverse(cellkey, sep=SEP, hierarchy=('1', '2', '3', '4', 'x')) -> bool:
    """Return True if the left side of ``cellkey`` ranks below its right side.

    ``hierarchy`` is ordered from highest to lowest. The key is inverse when
    some lower-ranked item occurs before ``sep`` while a higher-ranked item
    occurs after it.
    """
    left, _, right = cellkey.partition(sep)
    return any(lower in left and higher in right
               for higher, lower in itertools.combinations(hierarchy, 2))


def is_inverse_restricted(cellkey, sep=SEP, high=('1', '2'), low=('3', '4', 'x')) -> bool:
    """Return True if a ``low`` item precedes ``sep`` and a ``high`` item follows it."""
    left, _, right = cellkey.partition(sep)
    if not any(item in left for item in low):
        return False
    return any(item in right for item in high)


def is_inverse_relaxed(cellkey, sep=SEP, high=('1', '2')) -> bool:
    """Return True if any ``high`` item occurs after ``sep`` in ``cellkey``.

    The part before ``sep`` is not inspected.
    """
    right = cellkey.partition(sep)[2]
    for item in high:
        if item in right:
            return True
    return False


def get_slot(cellpos) -> str:
    """Map a marker position to its slot name.

    Negative positions are ``'prefix'``, positive ones ``'suffix'`` and
    anything else (i.e. zero) is ``'stem'``.
    """
    # The return annotation used to say ``bool`` although a string is returned.
    if cellpos < 0:
        return 'prefix'
    if cellpos > 0:
        return 'suffix'
    return 'stem'


# Derive per-marker features from the cell key, the position and the meaning:
#   Trans  - the cell key contains SEP
#   Local  - is_local() holds for the cell key
#   Direct - is_inverse() does not hold for the cell key
#   Slot   - prefix/suffix/stem according to the sign of Position
#   Portmx - the meaning contains SEP
# NOTE(review): DataFrame.insert refuses to add an existing column, so
# re-running this cell without reloading df is expected to fail.
df.insert(2, 'Trans', df['Cell'].str.contains(SEP))
df.insert(3, 'Local', df['Cell'].map(is_local))
df.insert(4, 'Direct', ~df['Cell'].map(is_inverse))
df.insert(6, 'Slot', df['Position'].map(get_slot))
df.insert(9, 'Portmx', df['Meaning'].str.contains(SEP))

df.head(10)

Out[4]:

	Language	Cell	Trans	Local	Direct	Position	Slot	Form	Meaning	Portmx
0	Ainu	1s	False	False	True	-1	prefix	ku	SA[+1 +sg]	False
1	Ainu	1p	False	False	True	1	suffix	as	S[+1 +pl]	False
2	Ainu	2s	False	False	True	-1	prefix	e	SAP[-3 +sg]	False
3	Ainu	2p	False	False	True	-1	prefix	eci	SAP[+2]	False
4	Ainu	x	False	False	True	1	suffix	an	S[-1 -2 -3]	False
5	Ainu	1s->2s	True	True	True	-1	prefix	eci	SAP[+2]	False
6	Ainu	1s->2p	True	True	True	-1	prefix	eci	SAP[+2]	False
7	Ainu	1s->3s	True	False	True	-1	prefix	ku	SA[+1 +sg]	False
8	Ainu	1s->3p	True	False	True	-1	prefix	ku	SA[+1 +sg]	False
9	Ainu	1s->x	True	False	True	-2	prefix	ku	SA[+1 +sg]	False

In [5]:

# Number of distinct languages in the data.
df['Language'].nunique()

Out[5]:

26

In [6]:

# Number of markers (rows of df) by Trans and Local, with totals.
df.pivot_table('Position', 'Trans', 'Local', aggfunc=len, fill_value=0, margins=True)

Out[6]:

Local	False	True	All
Trans
False	270	0	270
True	1963	585	2548
All	2233	585	2818

In [7]:

# Same table, counted over distinct (language, cell) pairs instead of markers.
(df.groupby(['Language', 'Cell'])[['Trans', 'Local']].first()
 .pivot_table(index='Trans', columns='Local', aggfunc=len, fill_value=0, margins=True))

Out[7]:

Local	False	True	All
Trans
False	186	0	186
True	890	241	1131
All	1076	241	1317

In [8]:

# Number of distinct cells per language.
df.groupby('Language')['Cell'].nunique().to_frame()

Out[8]:

	Cell
Language
Ainu	42
Aleut	72
Ayacucho	39
Bella Coola	34
Chuckchi	33
Darai	34
Fox	49
Hixkaryana	35
Jaqaru	14
Jumjum	44
Karuk	34
Ket	102
Kunama	86
Lakota	37
Maricopa	28
Maung	115
Mordvin	34
Nocte	34
Reyesano	33
Sahu	115
Siuslawan	85
Tepehua	37
Thangmi	33
Turkana	34
Wardaman	42
Yimas	72

In [9]:

# Summary statistics of the per-language mean number of markers per cell.
(df.groupby(['Language', 'Cell'])['Position'].size()
 .groupby(level='Language').mean()
 .to_frame('marker/cell')
 .sort_values(by='marker/cell')
 .describe())

Out[9]:

	marker/cell
count	26.000000
mean	1.975917
std	0.659278
min	1.000000
25%	1.443182
50%	1.911855
75%	2.460084
max	3.361111

In [10]:

# Standard deviation of the same per-language means, computed with ddof=0.
(df.groupby(['Language', 'Cell'])['Position'].size()
 .groupby(level='Language').mean()
 .std(ddof=0))

Out[10]:

0.6464749527797673

Learned lexemes

In [11]:

# Sanity check: Slot and Portmx never vary within a (language, form, meaning)
# group, so taking the first value per group below loses nothing.
assert (df.groupby(['Language', 'Form', 'Meaning'])[['Slot', 'Portmx']]
        .filter(lambda g: (g.nunique() > 1).any())
        .empty)

# Number of (language, form, meaning) groups by Slot and Portmx, with totals.
(df.groupby(['Language', 'Form', 'Meaning'])[['Slot', 'Portmx']].first()
 .pivot_table(index='Slot', columns='Portmx', aggfunc=len, fill_value=0, margins=True))

Out[11]:

Portmx	False	True	All
Slot
prefix	132	50	182
suffix	119	55	174
All	251	105	356

In [12]:

# Number of (language, form, meaning) groups per language, split by Portmx.
(df.groupby(['Language', 'Form', 'Meaning'], as_index=False)['Portmx'].first()
 .pivot_table('Form', 'Language', 'Portmx', aggfunc=len, fill_value=0, margins=True))

Out[12]:

Portmx	False	True	All
Language
Ainu	9	1	10
Aleut	8	9	17
Ayacucho	6	2	8
Bella Coola	9	2	11
Chuckchi	10	5	15
Darai	9	2	11
Fox	13	4	17
Hixkaryana	7	2	9
Jaqaru	4	5	9
Jumjum	16	9	25
Karuk	10	3	13
Ket	12	0	12
Kunama	11	6	17
Lakota	7	1	8
Maricopa	3	0	3
Maung	19	16	35
Mordvin	10	8	18
Nocte	8	2	10
Reyesano	4	0	4
Sahu	19	0	19
Siuslawan	10	6	16
Tepehua	9	3	12
Thangmi	7	3	10
Turkana	6	1	7
Wardaman	7	5	12
Yimas	18	10	28
All	251	105	356

In [13]:

# Per (language, form, meaning, portmx) group, Local becomes True if all of its
# occurrences are local, False if none is, and None otherwise. The query
# 'Local == Local' then keeps only the rows where Local is not missing, and
# the pivot counts the remaining groups per language and Local by Portmx.
llf = (df.groupby(['Language', 'Form', 'Meaning', 'Portmx'], as_index=False)['Local']
       .agg(lambda x: True if x.all() else False if not x.any() else None)
       .query('Local == Local')
       .pivot_table('Form', ['Language', 'Local'], 'Portmx', aggfunc=len, fill_value=0, dropna=False))

# Attach the per-language Fisher test results and keep rows with p <= .05.
(llf.join(llf.groupby(level='Language').apply(fisher_exact))
 .rename_axis('Portmx', axis=1)
 .query('p <= .05'))

Out[13]:

	Portmx	False	True	odds	p
Language	Local

In [14]:

# Per (language, form, meaning, portmx) group, Direct becomes True if all of
# its occurrences are direct, False if none is, and None otherwise. The pivot
# counts the unambiguous groups per language and Direct by Portmx.
dlf = (df.groupby(['Language', 'Form', 'Meaning', 'Portmx'], as_index=False)['Direct']
       .agg(lambda x: True if x.all() else False if not x.any() else None)
       .query('Direct == Direct')  # filter out NaN
       .pivot_table('Form', ['Language', 'Direct'], 'Portmx', aggfunc=len, fill_value=0, dropna=False))

# Attach the per-language Fisher test results and keep rows with p <= .05.
# The tests must be computed from dlf itself: the previous version grouped
# llf here and thus reported the p-values of the Local analysis.
(dlf.join(dlf.groupby(level='Language').apply(fisher_exact))
 .rename_axis('Portmx', axis=1)
 .query('p <= .05'))

Out[14]:

	Portmx	False	True	odds	p
Language	Direct

Transitive occurrences

In [15]:

# Local x Portmx contingency table over all transitive markers, together with
# the chi-square and Fisher test results computed by crosstab().
ct = crosstab(df[df['Trans']], 'Local', 'Portmx', tests=True)
ct.df

Out[15]:

Portmx	False	True	All
Local
False	1725	238	1963
True	528	57	585
All	2253	295	2548

In [16]:

# Expected frequencies reported by the chi-square test for the table above.
ct.expected

Out[16]:

Portmx	False	True
Local
False	1735.729592	227.270408
True	517.270408	67.729592

In [17]:

# Chi-square result without the array-valued 'expected' column.
ct.chi2.drop('expected', axis=1)

Out[17]:

	chi2	p	dof	usable
Local & Portmx	2.268065	0.132065	1	True

In [18]:

# Fisher exact test result ('odds' and 'p') for the same table.
ct.fisher_exact

Out[18]:

	odds	p
Local & Portmx	0.782444	0.122239

In [19]:

# Per language and Local value: number of transitive markers ('Markers') and
# number of those that are portmanteaus ('Portmx').
of = (df[df['Trans']]
      .groupby(['Language', 'Local'])['Portmx']
      .agg([('Markers', 'size'), ('Portmx', ntrue)]))

# Ratio: percentage of portmanteaus among the markers.
# Ratio (norm): the ratio divided by the sum of the ratios of its language.
of['Ratio'] = 100 * of['Portmx'] / of['Markers']
of['Ratio (norm)'] = of['Ratio'] / of['Ratio'].groupby(level='Language').sum()
# Where the normalized ratio is undefined (NaN), blank the raw ratio as well.
of.loc[of['Ratio (norm)'].isnull(), 'Ratio'] = None

# Long format with Local as an ordinary column, used for the correlations.
ofp = of.reset_index('Local')

of.unstack('Local')

Out[19]:

	Markers		Portmx		Ratio		Ratio (norm)
Local	False	True	False	True	False	True	False	True
Language
Ainu	41	12	2	0	4.878049	0.000000	1.000000	0.000000
Aleut	66	33	21	0	31.818182	0.000000	1.000000	0.000000
Ayacucho	62	22	4	4	6.451613	18.181818	0.261905	0.738095
Bella Coola	37	21	3	0	8.108108	0.000000	1.000000	0.000000
Chuckchi	29	12	5	4	17.241379	33.333333	0.340909	0.659091
Darai	53	20	4	0	7.547170	0.000000	1.000000	0.000000
Fox	115	24	14	4	12.173913	16.666667	0.422111	0.577889
Hixkaryana	22	6	4	0	18.181818	0.000000	1.000000	0.000000
Jaqaru	9	7	3	4	33.333333	57.142857	0.368421	0.631579
Jumjum	55	19	12	5	21.818182	26.315789	0.453280	0.546720
Karuk	25	11	4	1	16.000000	9.090909	0.637681	0.362319
Ket	196	22	0	0	NaN	NaN	NaN	NaN
Kunama	139	47	37	1	26.618705	2.127660	0.925985	0.074015
Lakota	42	20	0	2	0.000000	10.000000	0.000000	1.000000
Maricopa	16	16	0	0	NaN	NaN	NaN	NaN
Maung	255	18	69	5	27.058824	27.777778	0.493445	0.506555
Mordvin	31	9	10	1	32.258065	11.111111	0.743802	0.256198
Nocte	22	8	4	2	18.181818	25.000000	0.421053	0.578947
Reyesano	32	12	0	0	NaN	NaN	NaN	NaN
Sahu	195	24	0	0	NaN	NaN	NaN	NaN
Siuslawan	178	61	12	1	6.741573	1.639344	0.804396	0.195604
Tepehua	56	32	2	9	3.571429	28.125000	0.112676	0.887324
Thangmi	39	18	1	3	2.564103	16.666667	0.133333	0.866667
Turkana	28	10	4	0	14.285714	0.000000	1.000000	0.000000
Wardaman	67	25	8	4	11.940299	16.000000	0.427350	0.572650
Yimas	153	76	15	7	9.803922	9.210526	0.515604	0.484396

In [20]:

# Correlation of Local with the raw and the normalized ratio.
pd.DataFrame([pearsonr(ofp, 'Local', c) for c in ['Ratio', 'Ratio (norm)']])

Out[20]:

	r	p
Local & Ratio	-0.040995	0.791617
Local & Ratio (norm)	-0.277682	0.068004

In [21]:

# Per-language Local x Portmx counts of transitive markers.
ctl = (df[df['Trans']]
       .pivot_table('Position', ['Language', 'Local'], 'Portmx', aggfunc=len, fill_value=0))

# Attach the per-language chi-square results; keep those that are usable
# (all expected frequencies >= 5) and have p <= .05.
(ctl.join(ctl.groupby(level='Language').apply(chi2_contingency))
 .rename_axis('Portmx', axis=1)
 .query('usable == 1 & p <= .05'))

Out[21]:

	Portmx	False	True	chi2	p	dof	expected	usable
Language	Local
Aleut	False	45	21	11.491071	0.000699	1.0	[[52.0, 14.0], [26.0, 7.0]]	True
Aleut	True	33	0	11.491071	0.000699	1.0	[[52.0, 14.0], [26.0, 7.0]]	True
Kunama	False	102	37	11.496919	0.000697	1.0	[[110.60215053763442, 28.397849462365592], [37...	True
Kunama	True	46	1	11.496919	0.000697	1.0	[[110.60215053763442, 28.397849462365592], [37...	True

In [22]:

# Same per-language tables with Fisher's exact test; keep rows with p <= .05.
(ctl.join(ctl.groupby(level='Language').apply(fisher_exact))
 .rename_axis('Portmx', axis=1)
 .query('p <= .05'))

Out[22]:

	Portmx	False	True	odds	p
Language	Local
Aleut	False	45	21	0.000000	0.000062
Aleut	True	33	0	0.000000	0.000062
Kunama	False	102	37	0.059929	0.000102
Kunama	True	46	1	0.059929	0.000102
Tepehua	False	54	2	10.565217	0.001467
Tepehua	True	23	9	10.565217	0.001467

Transitive cells

In [23]:

# One row per transitive cell of each language: number of markers, number of
# portmanteau markers, and whether the cell has any portmanteau ('Hasp').
cells = (df[df['Trans']]
         .groupby(['Language', 'Cell', 'Local', 'Direct'], sort=False)['Portmx']
         .agg([('Markers', 'size'), ('Portmx', ntrue), ('Hasp', 'any')])
         .reset_index(['Local', 'Direct']))

# Percentage of a cell's markers that are portmanteaus.
cells['Ratio'] = 100 * cells['Portmx'] / cells['Markers']

cells

Out[23]:

		Local	Direct	Markers	Portmx	Hasp	Ratio
Language	Cell
Ainu	1s->2s	True	True	1	0	False	0.000000
	1s->2p	True	True	1	0	False	0.000000
	1s->3s	False	True	1	0	False	0.000000
	1s->3p	False	True	1	0	False	0.000000
	1s->x	False	True	2	0	False	0.000000
...	...	...	...	...	...	...	...
Yimas	3p->2d	False	False	4	0	False	0.000000
	3p->2p	False	False	3	0	False	0.000000
	3p->3s	False	True	4	1	True	25.000000
	3p->3d	False	True	4	1	True	25.000000
	3p->3p	False	True	3	1	True	33.333333

1131 rows × 6 columns

In [24]:

# Per-language correlation between a cell's portmanteau ratio and its number
# of markers.
cells.groupby(level='Language').apply(lambda x: pearsonr(x, 'Ratio', 'Markers'))

Out[24]:

Ratio & Markers	r	p
Language
Ainu	-0.173851	0.303450
Aleut	0.035474	0.782539
Ayacucho	-0.216970	0.232945
Bella Coola	0.240767	0.217134
Chuckchi	-0.024049	0.905222
Darai	-0.036824	0.852418
Fox	-0.132580	0.414771
Hixkaryana	NaN	NaN
Jaqaru	-0.167380	0.643943
Jumjum	-0.192715	0.260133
Karuk	-0.034922	0.859967
Ket	NaN	NaN
Kunama	-0.175439	0.132188
Lakota	-0.230396	0.212423
Maricopa	NaN	NaN
Maung	-0.238243	0.014874
Mordvin	0.006371	0.974333
Nocte	-0.144841	0.462110
Reyesano	NaN	NaN
Sahu	NaN	NaN
Siuslawan	0.426728	0.000135
Tepehua	0.675099	0.000031
Thangmi	0.587354	0.001016
Turkana	-0.019087	0.923202
Wardaman	0.023560	0.893134
Yimas	0.411838	0.000798

In [25]:

# Local x Hasp contingency table over all transitive cells, with the
# chi-square and Fisher test results computed by crosstab().
ct = crosstab(cells, 'Local', 'Hasp', tests=True)

ct.df

Out[25]:

Hasp	False	True	All
Local
False	686	204	890
True	196	45	241
All	882	249	1131

In [26]:

# Expected frequencies reported by the chi-square test for the table above.
ct.expected

Out[26]:

Hasp	False	True
Local
False	694.058355	195.941645
True	187.941645	53.058355

In [27]:

# Chi-square result without the array-valued 'expected' column.
ct.chi2.drop('expected', axis=1)

Out[27]:

	chi2	p	dof	usable
Local & Hasp	1.754557	0.185305	1	True

In [28]:

# Fisher exact test result ('odds' and 'p') for the same table.
ct.fisher_exact

Out[28]:

	odds	p
Local & Hasp	0.772059	0.162222

In [29]:

# Correlation of Local with Hasp and with Ratio over all transitive cells.
pd.DataFrame([pearsonr(cells, 'Local', c) for c in ['Hasp', 'Ratio']])

Out[29]:

	r	p
Local & Hasp	-0.041992	0.158161
Local & Ratio	-0.052016	0.080369

In [30]:

# Per-language Local x Hasp counts of transitive cells.
ctl = cells.pivot_table('Markers', ['Language', 'Local'], 'Hasp', aggfunc=len, fill_value=0)

# Attach the per-language chi-square results; keep those that are usable
# (all expected frequencies >= 5) and have p <= .05.
(ctl.join(ctl.groupby(level='Language').apply(chi2_contingency))
 .rename_axis('Hasp', axis=1)
 .query('usable == 1 & p <= .05'))

Out[30]:

	Hasp	False	True	chi2	p	dof	expected	usable
Language	Local
Kunama	False	30	27	8.513763	0.003525	1.0	[[35.72, 21.28], [11.28, 6.72]]	True
Kunama	True	17	1	8.513763	0.003525	1.0	[[35.72, 21.28], [11.28, 6.72]]	True

In [31]:

# Same per-language tables with Fisher's exact test; keep rows with p <= .05.
(ctl.join(ctl.groupby(level='Language').apply(fisher_exact))
 .rename_axis('Hasp', axis=1)
 .query('p <= .05'))

Out[31]:

	Hasp	False	True	odds	p
Language	Local
Aleut	False	29	16	0.000000	0.002852
Aleut	True	18	0	0.000000	0.002852
Kunama	False	30	27	0.065359	0.001533
Kunama	True	17	1	0.065359	0.001533
Tepehua	False	21	2	31.500000	0.000921
Tepehua	True	2	6	31.500000	0.000921

Cells with portmanteau vs. Occurrences vs. Mean cell ratio

In [32]:

# Per language and Local value: number of cells, number of cells with a
# portmanteau, total markers, total portmanteaus, and mean per-cell ratio.
cf = (cells.reset_index()
      .groupby(['Language', 'Local'])
      .agg({'Cell': 'size', 'Hasp': ntrue, 'Markers': 'sum',
            'Portmx': 'sum', 'Ratio': 'mean'})
      .rename(columns={'Cell': 'Cells', 'Ratio': 'Cell ratio'}))

# Perc: percentage of cells with a portmanteau.
# Ratio: percentage of markers that are portmanteaus.
cf.insert(2, 'Perc', 100 * cf['Hasp'] / cf['Cells'])
cf.insert(5, 'Ratio', 100 * cf['Portmx'] / cf['Markers'])
# Normalize the three measures by their per-language sums and add them as
# '... (norm)' columns.
nf = cf[['Perc', 'Ratio', 'Cell ratio']]
nf /= nf.groupby(level='Language').sum()
cf = cf.join(nf, rsuffix=' (norm)')
# Where the normalized value is undefined (NaN), blank the raw measures too.
cf.loc[cf['Perc (norm)'].isnull(), ['Perc', 'Ratio', 'Cell ratio']] = None

# Long format with Local as an ordinary column, used for the correlations.
cfp = cf.reset_index('Local')

# Wide format with Local as a column level, used for display and plotting.
cf = cf.unstack('Local')

cf.loc[:, :'Cell ratio']

Out[32]:

	Cells		Hasp		Perc		Markers		Portmx		Ratio		Cell ratio
Local	False	True	False	True	False	True	False	True	False	True	False	True	False	True
Language
Ainu	29	8	2	0	6.896552	0.000000	41	12	2	0	4.878049	0.000000	6.896552	0.000000
Aleut	45	18	16	0	35.555556	0.000000	66	33	21	0	31.818182	0.000000	28.888889	0.000000
Ayacucho	24	8	4	4	16.666667	50.000000	62	22	4	4	6.451613	18.181818	4.513889	31.250000
Bella Coola	20	8	3	0	15.000000	0.000000	37	21	3	0	8.108108	0.000000	5.833333	0.000000
Chuckchi	19	8	5	4	26.315789	50.000000	29	12	5	4	17.241379	33.333333	21.052632	25.000000
Darai	20	8	4	0	20.000000	0.000000	53	20	4	0	7.547170	0.000000	7.916667	0.000000
Fox	32	8	14	4	43.750000	50.000000	115	24	14	4	12.173913	16.666667	12.552083	17.708333
Hixkaryana	22	6	4	0	18.181818	0.000000	22	6	4	0	18.181818	0.000000	18.181818	0.000000
Jaqaru	7	3	3	3	42.857143	100.000000	9	7	3	4	33.333333	57.142857	42.857143	55.555556
Jumjum	28	8	11	4	39.285714	50.000000	55	19	12	5	21.818182	26.315789	27.023810	34.375000
Karuk	20	8	4	1	20.000000	12.500000	25	11	4	1	16.000000	9.090909	17.500000	6.250000
Ket	84	8	0	0	NaN	NaN	196	22	0	0	NaN	NaN	NaN	NaN
Kunama	57	18	27	1	47.368421	5.555556	139	47	37	1	26.618705	2.127660	28.654971	1.851852
Lakota	23	8	0	2	0.000000	25.000000	42	20	0	2	0.000000	10.000000	0.000000	18.750000
Maricopa	16	8	0	0	NaN	NaN	16	16	0	0	NaN	NaN	NaN	NaN
Maung	96	8	58	3	60.416667	37.500000	255	18	69	5	27.058824	27.777778	29.774306	25.000000
Mordvin	20	8	9	1	45.000000	12.500000	31	9	10	1	32.258065	11.111111	33.333333	12.500000
Nocte	20	8	4	2	20.000000	25.000000	22	8	4	2	18.181818	25.000000	20.000000	25.000000
Reyesano	20	8	0	0	NaN	NaN	32	12	0	0	NaN	NaN	NaN	NaN
Sahu	96	8	0	0	NaN	NaN	195	24	0	0	NaN	NaN	NaN	NaN
Siuslawan	57	18	9	1	15.789474	5.555556	178	61	12	1	6.741573	1.639344	4.970760	1.111111
Tepehua	23	8	2	6	8.695652	75.000000	56	32	2	9	3.571429	28.125000	2.536232	24.375000
Thangmi	20	8	1	2	5.000000	25.000000	39	18	1	3	2.564103	16.666667	1.250000	9.375000
Turkana	20	8	4	0	20.000000	0.000000	28	10	4	0	14.285714	0.000000	15.000000	0.000000
Wardaman	27	8	8	3	29.629630	37.500000	67	25	8	4	11.940299	16.000000	12.037037	15.625000
Yimas	45	18	12	4	26.666667	22.222222	153	76	15	7	9.803922	9.210526	8.851852	5.707672

In [33]:

# Correlate the boolean Local column with each raw measure.
# Point-biserial correlation = Pearson r
pd.DataFrame([pearsonr(cfp, 'Local', c) for c in ['Perc', 'Ratio', 'Cell ratio']])

Out[33]:

	r	p
Local & Perc	0.021124	0.891740
Local & Ratio	-0.040995	0.791617
Local & Cell ratio	-0.069615	0.653413

In [34]:

# Pairwise correlations among the three raw measures.
cfp[['Perc', 'Ratio', 'Cell ratio']].corr()

Out[34]:

	Perc	Ratio	Cell ratio
Perc	1.000000	0.893704	0.873536
Ratio	0.893704	1.000000	0.956063
Cell ratio	0.873536	0.956063	1.000000

In [35]:

# Stacked bars of the mean cell ratio (non-local and local), with languages
# ordered by their normalized local cell ratio (stable sort).
cf_sorted = cf.sort_values(by=('Cell ratio (norm)', True), kind='mergesort')
ax = cf_sorted['Cell ratio'].plot.bar(stacked=True, figsize=(15, 5), cmap=plt.cm.gray)

# Star the languages for which the per-language Fisher test on transitive
# occurrences reported p <= .05. The bar position is looked up in the sorted
# index instead of being hard-coded, so it stays right if the order changes.
for lang in ['Aleut', 'Kunama', 'Tepehua']:
    x = cf_sorted.index.get_loc(lang)
    ax.annotate('*', (x - .05, cf.loc[lang, 'Cell ratio'].sum() + .5))

#ax.get_figure().savefig('results_local.pdf', bbox_inches='tight')

In [36]:

# Per-language correlation (r) between Local and the cell ratio, as bars in
# ascending order.
(cells.groupby(level='Language').apply(lambda x: pearsonr(x, 'Local', 'Ratio'))['r']
 .sort_values(kind='mergesort')
 .plot.bar(figsize=(15, 5)));

In [37]:

# One line per language connecting its cell ratio for Local=False and
# Local=True (languages with missing values dropped); legend placed outside.
ax = cf['Cell ratio'].dropna().T.plot(figsize=(8, 6), legend=False)
ax.legend(bbox_to_anchor=(1.35, 1.15));

In [38]:

# Box plots of the cell ratio across languages, one box per Local value.
cf['Cell ratio'].boxplot(return_type='axes');

In [39]:

# Violin plots of the same two distributions (rows with NaN dropped).
fig, ax = plt.subplots()

ax.violinplot(cf['Cell ratio'].dropna().values, showmedians=True, bw_method=.25);

In [40]:

# Marker and portmanteau counts next to the two normalized ratios.
cf[['Markers', 'Portmx', 'Ratio (norm)', 'Cell ratio (norm)']]

Out[40]:

	Markers		Portmx		Ratio (norm)		Cell ratio (norm)
Local	False	True	False	True	False	True	False	True
Language
Ainu	41	12	2	0	1.000000	0.000000	1.000000	0.000000
Aleut	66	33	21	0	1.000000	0.000000	1.000000	0.000000
Ayacucho	62	22	4	4	0.261905	0.738095	0.126214	0.873786
Bella Coola	37	21	3	0	1.000000	0.000000	1.000000	0.000000
Chuckchi	29	12	5	4	0.340909	0.659091	0.457143	0.542857
Darai	53	20	4	0	1.000000	0.000000	1.000000	0.000000
Fox	115	24	14	4	0.422111	0.577889	0.414802	0.585198
Hixkaryana	22	6	4	0	1.000000	0.000000	1.000000	0.000000
Jaqaru	9	7	3	4	0.368421	0.631579	0.435484	0.564516
Jumjum	55	19	12	5	0.453280	0.546720	0.440136	0.559864
Karuk	25	11	4	1	0.637681	0.362319	0.736842	0.263158
Ket	196	22	0	0	NaN	NaN	NaN	NaN
Kunama	139	47	37	1	0.925985	0.074015	0.939297	0.060703
Lakota	42	20	0	2	0.000000	1.000000	0.000000	1.000000
Maricopa	16	16	0	0	NaN	NaN	NaN	NaN
Maung	255	18	69	5	0.493445	0.506555	0.543582	0.456418
Mordvin	31	9	10	1	0.743802	0.256198	0.727273	0.272727
Nocte	22	8	4	2	0.421053	0.578947	0.444444	0.555556
Reyesano	32	12	0	0	NaN	NaN	NaN	NaN
Sahu	195	24	0	0	NaN	NaN	NaN	NaN
Siuslawan	178	61	12	1	0.804396	0.195604	0.817308	0.182692
Tepehua	56	32	2	9	0.112676	0.887324	0.094244	0.905756
Thangmi	39	18	1	3	0.133333	0.866667	0.117647	0.882353
Turkana	28	10	4	0	1.000000	0.000000	1.000000	0.000000
Wardaman	67	25	8	4	0.427350	0.572650	0.435146	0.564854
Yimas	153	76	15	7	0.515604	0.484396	0.607977	0.392023

In [41]:

# Correlate the boolean Local column with each normalized measure.
pd.DataFrame([pearsonr(cfp, 'Local', f'{c} (norm)') for c in ['Perc', 'Ratio', 'Cell ratio']])

Out[41]:

	r	p
Local & Perc (norm)	-0.291594	0.054795
Local & Ratio (norm)	-0.277682	0.068004
Local & Cell ratio (norm)	-0.306743	0.042843

In [42]:

# Pairwise correlations among the three normalized measures.
cfp[[f'{c} (norm)' for c in ['Perc', 'Ratio', 'Cell ratio']]].corr()

Out[42]:

	Perc (norm)	Ratio (norm)	Cell ratio (norm)
Perc (norm)	1.000000	0.993548	0.983565
Ratio (norm)	0.993548	1.000000	0.988801
Cell ratio (norm)	0.983565	0.988801	1.000000

In [43]:

# Stacked bars of the normalized cell ratio, languages ordered by the
# Local=True share (stable sort).
ax = (cf['Cell ratio (norm)'].sort_values(by=True, kind='mergesort')
      .plot.bar(stacked=True, figsize=(15, 5), cmap=plt.cm.gray))

#ax.get_figure().savefig('results_local_norm.pdf', bbox_inches='tight')

In [44]:

# Sum of the normalized cell ratio over all languages, per Local value.
cf['Cell ratio (norm)'].sum().to_frame('Cell ratio (norm)').T

Out[44]:

Local	False	True
Cell ratio (norm)	13.337538	8.662462

In [45]:

# One line per language connecting its normalized cell ratio for Local=False
# and Local=True (languages with missing values dropped).
ax = cf['Cell ratio (norm)'].dropna().T.plot(figsize=(8, 6), legend=False)

ax.legend(bbox_to_anchor=(1.35, 1.15));

In [46]:

# Box plots of the normalized cell ratio, one box per Local value.
cf['Cell ratio (norm)'].boxplot(return_type='axes');

In [47]:

# Violin plots of the normalized cell ratio (rows with NaN dropped).
fig, ax = plt.subplots()

ax.violinplot(cf['Cell ratio (norm)'].dropna().values, showmedians=True, bw_method=.25);

Direct

In [48]:

# Per-language Direct x Portmx counts of transitive markers.
ctd = (df[df['Trans']]\
       .pivot_table('Position', ['Language', 'Direct'], 'Portmx', aggfunc=len, fill_value=0))

# Attach the per-language Fisher test results and keep rows with p <= .05.
(ctd.join(ctd.groupby(level='Language').apply(fisher_exact))
 .rename_axis('Portmx', axis=1)
 .query('p <= .05'))

Out[48]:

	Portmx	False	True	odds	p
Language	Direct
Aleut	False	42	0	inf	0.000002
Aleut	True	36	21	inf	0.000002
Fox	False	53	14	0.222689	0.010231
Fox	True	68	4	0.222689	0.010231
Jumjum	False	43	8	3.455357	0.037495
Jumjum	True	14	9	3.455357	0.037495
Yimas	False	86	1	14.925620	0.000332
Yimas	True	121	21	14.925620	0.000332

In [49]:

# Per language and Direct value: number of cells, number of cells with a
# portmanteau, total markers, total portmanteaus, and mean per-cell ratio.
cfd = (cells.reset_index()
       .groupby(['Language', 'Direct'])
       .agg({'Cell': 'size', 'Hasp': ntrue,
             'Markers': 'sum', 'Portmx': 'sum', 'Ratio': 'mean'})\
       .rename(columns={'Cell': 'Cells', 'Ratio': 'Cell ratio'}))

# Perc: percentage of cells with a portmanteau.
# Ratio: percentage of markers that are portmanteaus.
cfd.insert(2, 'Perc', 100 * cfd['Hasp'] / cfd['Cells'])
cfd.insert(5, 'Ratio', 100 * cfd['Portmx'] / cfd['Markers'])
# Normalize the three measures by their per-language sums and add them as
# '... (norm)' columns.
nfd = cfd[['Perc', 'Ratio', 'Cell ratio']]
nfd /= nfd.groupby(level='Language').sum()
cfd = cfd.join(nfd, rsuffix=' (norm)')
# Where the normalized value is undefined (NaN), blank the raw measures too.
cfd.loc[cfd['Perc (norm)'].isnull(), ['Perc', 'Ratio', 'Cell ratio']] = None

# Long format with Direct as an ordinary column, used for the correlations.
cfdp = cfd.reset_index('Direct')

# Wide format with Direct as a column level, used for display and plotting.
cfd = cfd.unstack('Direct')

cfd.loc[:, :'Cell ratio']

Out[49]:

	Cells		Hasp		Perc		Markers		Portmx		Ratio		Cell ratio
Direct	False	True	False	True	False	True	False	True	False	True	False	True	False	True
Language
Ainu	18	19	0	2	0.000000	10.526316	29	24	0	2	0.000000	8.333333	0.000000	10.526316
Aleut	27	36	0	16	0.000000	44.444444	42	57	0	21	0.000000	36.842105	0.000000	36.111111
Ayacucho	14	18	4	4	28.571429	22.222222	47	37	4	4	8.510638	10.810811	7.738095	13.888889
Bella Coola	12	16	2	1	16.666667	6.250000	29	29	2	1	6.896552	3.448276	5.555556	3.125000
Chuckchi	11	16	1	8	9.090909	50.000000	18	23	1	8	5.555556	34.782609	4.545455	34.375000
Darai	12	16	2	2	16.666667	12.500000	26	47	2	2	7.692308	4.255319	8.333333	3.645833
Fox	18	22	14	4	77.777778	18.181818	67	72	14	4	20.895522	5.555556	22.314815	6.439394
Hixkaryana	10	18	2	2	20.000000	11.111111	10	18	2	2	20.000000	11.111111	20.000000	11.111111
Jaqaru	5	5	4	2	80.000000	40.000000	9	7	5	2	55.555556	28.571429	63.333333	30.000000
Jumjum	16	20	7	8	43.750000	40.000000	51	23	8	9	15.686275	39.130435	17.604167	37.500000
Karuk	12	16	2	3	16.666667	18.750000	18	18	2	3	11.111111	16.666667	8.333333	18.750000
Ket	28	64	0	0	NaN	NaN	76	142	0	0	NaN	NaN	NaN	NaN
Kunama	33	42	9	19	27.272727	45.238095	64	122	11	27	17.187500	22.131148	18.181818	25.396825
Lakota	14	17	0	2	0.000000	11.764706	29	33	0	2	0.000000	6.060606	0.000000	8.823529
Maricopa	12	12	0	0	NaN	NaN	20	12	0	0	NaN	NaN	NaN	NaN
Maung	34	70	12	49	35.294118	70.000000	83	190	16	58	19.277108	30.526316	18.627451	34.642857
Mordvin	12	16	3	7	25.000000	43.750000	14	26	4	7	28.571429	26.923077	22.222222	31.250000
Nocte	12	16	2	4	16.666667	25.000000	12	18	2	4	16.666667	22.222222	16.666667	25.000000
Reyesano	12	16	0	0	NaN	NaN	24	20	0	0	NaN	NaN	NaN	NaN
Sahu	34	70	0	0	NaN	NaN	73	146	0	0	NaN	NaN	NaN	NaN
Siuslawan	33	42	4	6	12.121212	14.285714	108	131	4	9	3.703704	6.870229	2.979798	4.880952
Tepehua	14	17	3	5	21.428571	29.411765	46	42	6	5	13.043478	11.904762	8.571429	7.843137
Thangmi	12	16	2	1	16.666667	6.250000	25	32	3	1	12.000000	3.125000	6.250000	1.562500
Turkana	12	16	0	4	0.000000	25.000000	18	20	0	4	0.000000	20.000000	0.000000	18.750000
Wardaman	16	19	2	9	12.500000	47.368421	47	45	3	9	6.382979	20.000000	4.687500	19.736842
Yimas	27	36	1	15	3.703704	41.666667	87	142	1	21	1.149425	14.788732	0.925926	13.224206

In [50]:

# Correlate the boolean Direct column with each raw measure.
pd.DataFrame([pearsonr(cfdp, 'Direct', c) for c in ['Perc', 'Ratio', 'Cell ratio']])

Out[50]:

	r	p
Direct & Perc	0.178565	0.246163
Direct & Ratio	0.215627	0.159804
Direct & Cell ratio	0.242860	0.112182

In [51]:

# Stacked bars of the mean cell ratio (inverse and direct), with languages
# ordered by their normalized direct cell ratio (stable sort).
cfd_sorted = cfd.sort_values(by=('Cell ratio (norm)', True), kind='mergesort')
ax = cfd_sorted['Cell ratio'].plot.bar(stacked=True, figsize=(15, 5), cmap=plt.cm.gray)

# Star the languages for which the per-language Fisher test on Direct x Portmx
# reported p <= .05. The bar position is looked up in the sorted index instead
# of being hard-coded, so it stays right if the order changes.
for lang in ['Aleut', 'Fox', 'Jumjum', 'Yimas']:
    x = cfd_sorted.index.get_loc(lang)
    ax.annotate('*', (x - .05, cfd.loc[lang, 'Cell ratio'].sum() + .5))

#ax.get_figure().savefig('results_direct.pdf', bbox_inches='tight')

In [52]:

# Per-language correlation (r) between Direct and the cell ratio, as bars in
# ascending order.
(cells.groupby(level='Language').apply(lambda x: pearsonr(x, 'Direct', 'Ratio'))['r']
 .sort_values(kind='mergesort')
 .plot.bar(figsize=(15, 5)));

In [53]:

# Marker and portmanteau counts next to the two normalized ratios.
cfd[['Markers', 'Portmx', 'Ratio (norm)', 'Cell ratio (norm)']]

Out[53]:

	Markers		Portmx		Ratio (norm)		Cell ratio (norm)
Direct	False	True	False	True	False	True	False	True
Language
Ainu	29	24	0	2	0.000000	1.000000	0.000000	1.000000
Aleut	42	57	0	21	0.000000	1.000000	0.000000	1.000000
Ayacucho	47	37	4	4	0.440476	0.559524	0.357798	0.642202
Bella Coola	29	29	2	1	0.666667	0.333333	0.640000	0.360000
Chuckchi	18	23	1	8	0.137725	0.862275	0.116788	0.883212
Darai	26	47	2	2	0.643836	0.356164	0.695652	0.304348
Fox	67	72	14	4	0.789969	0.210031	0.776054	0.223946
Hixkaryana	10	18	2	2	0.642857	0.357143	0.642857	0.357143
Jaqaru	9	7	5	2	0.660377	0.339623	0.678571	0.321429
Jumjum	51	23	8	9	0.286159	0.713841	0.319471	0.680529
Karuk	18	18	2	3	0.400000	0.600000	0.307692	0.692308
Ket	76	142	0	0	NaN	NaN	NaN	NaN
Kunama	64	122	11	27	0.437134	0.562866	0.417219	0.582781
Lakota	29	33	0	2	0.000000	1.000000	0.000000	1.000000
Maricopa	20	12	0	0	NaN	NaN	NaN	NaN
Maung	83	190	16	58	0.387064	0.612936	0.349678	0.650322
Mordvin	14	26	4	7	0.514851	0.485149	0.415584	0.584416
Nocte	12	18	2	4	0.428571	0.571429	0.400000	0.600000
Reyesano	24	20	0	0	NaN	NaN	NaN	NaN
Sahu	73	146	0	0	NaN	NaN	NaN	NaN
Siuslawan	108	131	4	9	0.350267	0.649733	0.379073	0.620927
Tepehua	46	42	6	5	0.522822	0.477178	0.522184	0.477816
Thangmi	25	32	3	1	0.793388	0.206612	0.800000	0.200000
Turkana	18	20	0	4	0.000000	1.000000	0.000000	1.000000
Wardaman	47	45	3	9	0.241935	0.758065	0.191919	0.808081
Yimas	87	142	1	21	0.072118	0.927882	0.065436	0.934564

In [54]:

# Correlate the boolean Direct column with each normalized measure.
pd.DataFrame([pearsonr(cfdp, 'Direct', f'{c} (norm)') for c in ['Perc', 'Ratio', 'Cell ratio']])

Out[54]:

	r	p
Direct & Perc (norm)	0.406578	0.006168
Direct & Ratio (norm)	0.416424	0.004928
Direct & Cell ratio (norm)	0.457289	0.001804

In [55]:

# Pairwise correlations among the three normalized measures.
cfdp[[f'{c} (norm)' for c in ['Perc', 'Ratio', 'Cell ratio']]].corr()

Out[55]:

	Perc (norm)	Ratio (norm)	Cell ratio (norm)
Perc (norm)	1.000000	0.958776	0.955347
Ratio (norm)	0.958776	1.000000	0.990275
Cell ratio (norm)	0.955347	0.990275	1.000000

In [56]:

# Stacked bars of the normalized cell ratio, languages ordered by the
# Direct=True share (stable sort).
ax = (cfd['Cell ratio (norm)'].sort_values(by=True, kind='mergesort')
      .plot.bar(stacked=True, figsize=(15, 5), cmap=plt.cm.gray))

#ax.get_figure().savefig('results_direct_norm.pdf', bbox_inches='tight')

In [57]:

# Sum of the normalized cell ratio over all languages, per Direct value.
cfd['Cell ratio (norm)'].sum().to_frame('Cell ratio (norm)').T

Out[57]:

Direct	False	True
Cell ratio (norm)	8.075977	13.924023