#!/usr/bin/env python
# coding: utf-8

# ## Load segmented affix occurrences from supplement

# In[1]:


import functools
import hashlib
import io
import pathlib
import shutil
import urllib.request
import zipfile

URL = 'https://zenodo.org/record/841982/files/xflr6/portmanteaus-v1.0.zip'

CSV = pathlib.Path('esm3-analyses.csv')


def sha256sum(filename, bufsize=32768):
    """Return the hexadecimal SHA-256 digest of the file at ``filename``.

    The file is opened in binary mode and fed to the hash in chunks of
    ``bufsize`` bytes, so it is never held in memory as a whole.
    """
    s = hashlib.sha256()
    with open(filename, 'rb') as fd:
        # iter(callable, sentinel): keep reading until read() returns b'' (EOF)
        for data in iter(functools.partial(fd.read, bufsize), b''):
            s.update(data)
    return s.hexdigest()


# Download the archive into memory and extract the single member whose name
# ends with CSV.name; skipped when the CSV is already present.
if not CSV.exists():
    with io.BytesIO() as b:
        with urllib.request.urlopen(URL) as u:
            shutil.copyfileobj(u, b)
        with zipfile.ZipFile(b) as z:
            # the one-element unpacking raises unless exactly one member matches
            i, = (i for i in z.infolist() if i.filename.endswith(CSV.name))
            # drop any directory prefix so the file lands at CSV.name
            i.filename = CSV.name
            z.extract(i)

sha256sum(CSV)


# In[2]:


get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats


def pearsonr(df, left, right, func=scipy.stats.pearsonr):
    """Correlate the columns ``left`` and ``right`` of ``df``.

    Rows with a missing value in either column are dropped before ``func``
    is called with the two remaining columns.

    Returns a ``pandas.Series`` built from the result of ``func``, indexed
    ``('r', 'p')`` and named ``'<left> & <right>'``.
    """
    df = df[[left, right]].dropna()
    name = f'{left} & {right}'
    # silence NumPy invalid-value floating point warnings raised inside func
    with np.errstate(invalid='ignore'):
        result = func(df[left], df[right])
    return pd.Series(result, index=('r', 'p'), name=name)


plt.style.use('classic')
plt.rcParams.update({'figure.figsize': (6, 4),
                     'figure.facecolor': 'w',
                     'figure.subplot.bottom': .125,
                     'font.size': 10,
                     'savefig.dpi': 72})


# In[3]:


RENAME = {'Quechua (Ayacucho)': 'Ayacucho',
          'Tlachichilco Tepehuan': 'Tepehua',
          'Lakhota': 'Lakota'}

cf = pd.read_csv(CSV, encoding='utf-8')

cf['Language'] = cf['Language'].replace(RENAME)

# mergesort is stable: rows keep their file order within each language
cf = cf.sort_values(by='Language', kind='mergesort').reset_index(drop=True)

cf.info()
assert cf.set_index(['Language', 'Cell', 'Position']).index.is_unique
cf.head(10)


# ## Reconcatenate word-forms with stem symbol

# In[4]:


STEM = 'Σ'

# the stem placeholder must not collide with any character in the affix forms
assert not cf['Form'].str.contains(STEM).any()

# number the (Language, Cell) groups in order of first appearance
_cf = (cf.drop('Meaning', axis=1)
       .assign(cell_index=lambda x: x.groupby(['Language', 'Cell'], sort=False).ngroup()))

# one synthetic row per cell: the stem symbol at position 0
_sf = (_cf.drop_duplicates('cell_index')
       .assign(Position=0, Form=STEM))
# rebuild one word-form per cell by joining its forms in position order
df = (pd.concat([_cf, _sf])
      .sort_values(by=['cell_index', 'Position'])
      .groupby(['cell_index', 'Language', 'Cell'])[['Form']]
      .agg(''.join)
      .reset_index('cell_index', drop=True))

df.info()
assert df.index.is_unique
df.head(10)


# ## Tag cells as 1/2<->1/2, 1/2<->3, and other

# In[5]:


SEP = '->'


def is_distinct_local(cellkey, sep=SEP, persons=('1', '2')):
    """Classify ``cellkey`` by which sides of ``sep`` mention a local person.

    ``cellkey`` is split at the first occurrence of ``sep``; a side counts as
    local when it contains any of the strings in ``persons``.

    Returns ``True`` when both sides are local, ``False`` when ``sep`` is
    present and exactly one side is local, and ``None`` otherwise.
    """
    subj, trans, obj = cellkey.partition(sep)
    local_subj, local_obj = (any(p in arg for p in persons) for arg in (subj, obj))
    if local_subj and local_obj:
        return True
    elif trans and (local_subj or local_obj):
        return False
    else:
        return None


df.insert(0, 'd_local', df.index.get_level_values('Cell').map(is_distinct_local))

df.info()
assert df.index.is_unique
df.head(30)


# ## Neutralization ratios (1/2<->3 vs. 1/2<->1/2)

# In[6]:


# cells tagged None (neither True nor False) are excluded by dropna
xf = (df.dropna(subset=['d_local'])
      .groupby(['Language', 'd_local'])['Form']
      .agg(['size', 'nunique']))

# neut: number of forms in the group beyond the distinct ones
xf['neut'] = xf['size'] - xf['nunique']
xf['ratio'] = 100 * xf['neut'] / (xf['size'] - 1)
xf['ratio (norm)'] = xf['ratio'] / xf['ratio'].groupby(level='Language').sum()
# blank out ratio wherever the normalized ratio is undefined
xf.loc[xf['ratio (norm)'].isnull(), 'ratio'] = None
# long-format copy (d_local as a column) for the correlation test below
xfp = xf.reset_index('d_local')
xf = xf.unstack()
xf


# In[7]:


(xf.sort_values(by=('ratio (norm)', True), kind='mergesort')['ratio']
 .plot.bar(stacked=True, figsize=(15, 5), cmap=plt.cm.gray));


# In[8]:


(xf['ratio (norm)'].sort_values(by=True, kind='mergesort')
 .plot.bar(stacked=True, figsize=(15, 5), cmap=plt.cm.gray));


# ## Test for neutralization differences

# In[9]:


pd.DataFrame([pearsonr(xfp, 'd_local', c) for c in ['ratio', 'ratio (norm)']])


# ## Count 1/2 subcategory (number, gender) neutralizations

# In[10]:


ff = df.dropna(subset=['d_local']).reset_index()

# A: part of the cell key before SEP, P: part after it
ff[['A', 'P']] = ff['Cell'].str.partition(SEP)[[0, 2]]
print(pd.concat([ff['A'], ff['P']]).unique())

a_first, p_first = (ff[x].str.contains(r'1') for x in ('A', 'P'))
a_second, p_second = (ff[x].str.contains(r'2|[dp]i') for x in ('A', 'P'))
a_third, p_third = ~a_first & ~a_second, ~p_first & ~p_second

# treat inclusive cells as first person only
a_second &= ~a_first
p_second &= ~p_first

# each argument must fall into exactly one person category
assert (pd.concat([a_first, a_second, a_third], axis=1).sum(axis=1) == 1).all()
assert (pd.concat([p_first, p_second, p_third], axis=1).sum(axis=1) == 1).all()

# key format '<pattern>:<column>': the part after ':' names the column (A or P)
# whose values are grouped on below
# NOTE(review): the extent of the commented-out entry was reconstructed from a
# whitespace-collapsed source (assumed: both ':A' local entries) -- confirm
groups = {'1->X:P': a_first & p_third, '2->X:P': a_second & p_third,
          'X->1:A': a_third & p_first, 'X->2:A': a_third & p_second,
          # '1->2:A': a_first & p_second, '2->1:A': a_second & p_first,
          '1->2:P': a_first & p_second, '2->1:P': a_second & p_first}

lf = (pd.concat([ff[c].groupby(['Language', 'd_local', g.rpartition(':')[-1]])['Form']
                 .agg(['size', 'nunique'])
                 .assign(group=g)
                 .set_index('group', append=True)
                 .swaplevel()
                 .reset_index('d_local')
                 for g, c in groups.items()])
      .sort_index())

lf.index.rename('X', level=2, inplace=True)
lf['neut'] = lf['size'] - lf['nunique']
lf['ratio'] = 100 * lf['neut'] / (lf['size'] - 1)
lf.head(14)


# ## 1/2 subcategory neutralization ratios (1/2<->3 vs. 1/2<->1/2)

# In[11]:


rf = lf.pivot_table('ratio', ['Language', 'd_local'], aggfunc='mean')
rf['ratio (norm)'] = rf['ratio'] / rf['ratio'].groupby(level='Language').sum()
# blank out ratio wherever the normalized ratio is undefined
rf.loc[rf['ratio (norm)'].isnull(), 'ratio'] = None
# long-format copy (d_local as a column) for the correlation test below
rfp = rf.reset_index('d_local')
rf = rf.unstack()
rf


# In[12]:


(rf.sort_values(by=('ratio (norm)', True), kind='mergesort')['ratio']
 .plot.bar(stacked=True, figsize=(15, 5), cmap=plt.cm.gray));


# In[13]:


(rf['ratio (norm)'].sort_values(by=True, kind='mergesort')
 .plot.bar(stacked=True, figsize=(15, 5), cmap=plt.cm.gray));


# ## Test for 1/2 subcategory neutralization differences

# In[14]:


pd.DataFrame([pearsonr(rfp, 'd_local', c) for c in ['ratio', 'ratio (norm)']])


# ## Absence of non-person features in learned meanings

# In[15]:


cf['Meaning'].str.extractall(r'([+-]\w+)')[0].value_counts().to_frame('n')


# In[16]:


# a sign followed by a run of non-digit characters
NONPERSON = r'[+-]\D+\b'

nf = cf.assign(PersonOnly=lambda x: ~x['Meaning'].str.contains(NONPERSON))
nf.head(14)


# In[17]:


# a cell is PersonOnly only if every one of its rows is
cnf = (nf.groupby(['Language', 'Cell'], sort=False)['PersonOnly'].all()
       .to_frame('PersonOnly'))
cnf.insert(0, 'd_local', cnf.index.get_level_values('Cell').map(is_distinct_local))
cnf.head(14)


# In[18]:


# share of PersonOnly cells per language and d_local value;
# cells tagged None are excluded by dropna
xnf = (cnf.dropna(subset=['d_local'])
       .groupby(['Language', 'd_local']).mean()
       .rename(columns={'PersonOnly': 'ratio'}))

xnf['ratio (norm)'] = xnf['ratio'] / xnf['ratio'].groupby(level='Language').sum()
# blank out ratio wherever the normalized ratio is undefined
xnf.loc[xnf['ratio (norm)'].isnull(), 'ratio'] = None
# long-format copy (d_local as a column) for the correlation test below
xnfp = xnf.reset_index('d_local')
xnf = xnf.unstack()
xnf


# In[19]:


(xnf['ratio (norm)'].sort_values(by=True, kind='mergesort')
 .plot.bar(stacked=True, figsize=(15, 5), cmap=plt.cm.gray));


# In[20]:


pd.DataFrame([pearsonr(xnfp, 'd_local', c) for c in ['ratio', 'ratio (norm)']])