#!/usr/bin/env python
# coding: utf-8

# In[1]:

import functools
import hashlib
import io
import pathlib
import shutil
import urllib.request
import zipfile

URL = 'https://zenodo.org/record/841982/files/xflr6/portmanteaus-v1.0.zip'

CSV = pathlib.Path('esm3-analyses.csv')


def sha256sum(filename, bufsize=32768) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``filename``.

    The file is read in chunks of ``bufsize`` bytes so it never has to be
    held in memory as a whole.
    """
    s = hashlib.sha256()
    with open(filename, 'rb') as fd:
        # iter(callable, sentinel): keep reading until read() returns b''.
        for data in iter(functools.partial(fd.read, bufsize), b''):
            s.update(data)
    return s.hexdigest()


if not CSV.exists():
    # Download the archive into memory and extract only the CSV member.
    with io.BytesIO() as b:
        with urllib.request.urlopen(URL) as u:
            shutil.copyfileobj(u, b)
        with zipfile.ZipFile(b) as z:
            # Single-target unpacking: raises ValueError unless exactly one
            # archive member ends with the CSV file name.
            info, = (i for i in z.infolist() if i.filename.endswith(CSV.name))
            # Replace the member's stored path by the bare file name so that
            # it is extracted directly into the working directory.
            info.filename = CSV.name
            z.extract(info)

sha256sum(CSV)


# In[2]:

get_ipython().run_line_magic('matplotlib', 'inline')

import types
import warnings

import matplotlib.pyplot as plt
import numpy as np
from numpy import count_nonzero as ntrue
import pandas as pd
import scipy.stats


def crosstab(df, index, columns, tests=False, margins=True, **kwargs):
    """Cross-tabulate columns of ``df``, optionally adding association tests.

    Args:
        df: Source data frame.
        index: Column name, or list of column names, for the table rows.
        columns: Column name, or list of column names, for the table columns.
        tests: If falsy, only the contingency table is returned. If truthy,
            ``chi2_contingency`` and ``fisher_exact`` are run on the table;
            a string value is used as row label of the test results.
        margins: Passed to ``pandas.crosstab``. When true, the last row and
            last column (the totals) are excluded from the tests.
        **kwargs: Passed through to ``pandas.crosstab``.

    Returns:
        The contingency table as a data frame, or (with ``tests``) a
        ``types.SimpleNamespace`` with the attributes ``df`` (the table),
        ``expected`` (expected frequencies), ``chi2`` and ``fisher_exact``
        (one-row data frames with the test results).
    """
    idx = [df[i] for i in index] if isinstance(index, list) else df[index]
    cols = [df[c] for c in columns] if isinstance(columns, list) else df[columns]
    result = pd.crosstab(idx, cols, margins=margins, **kwargs)
    if tests:
        result = types.SimpleNamespace(df=result)
        # Strip the margin totals before testing.
        values = result.df.iloc[:-1, :-1] if margins else result.df
        # NOTE: chi2_contingency() returns None when the scipy function raises
        # ValueError; that case is not handled here and fails on .to_frame().
        testres = chi2_contingency(values), fisher_exact(values)
        label = tests if isinstance(tests, str) else f'{index} & {columns}'
        testres = [tr.to_frame(label).T for tr in testres]
        result.expected = pd.DataFrame(
            testres[0].at[label, 'expected'],
            index=values.index, columns=values.columns)
        result.chi2, result.fisher_exact = testres
    return result


def chi2_contingency(x, func=scipy.stats.chi2_contingency):
    """Run a chi-square contingency test on ``x`` and label the result.

    Returns:
        A series with the entries ``chi2``, ``p``, ``dof``, ``expected`` and
        ``usable`` (true when all expected frequencies are at least 5), or
        ``None`` when ``func`` raises ``ValueError``.
    """
    try:
        chi2, p, dof, expected = func(x)
    except ValueError:
        return
    result = chi2, p, dof, expected, (expected >= 5).all()
    return pd.Series(result, index=('chi2', 'p', 'dof', 'expected', 'usable'))


def fisher_exact(x, func=scipy.stats.fisher_exact):
    """Return the result of ``func(x)`` as a series labelled ``odds``, ``p``."""
    return pd.Series(func(x), index=('odds', 'p'))


def pearsonr(df, left, right, func=scipy.stats.pearsonr):
    """Correlate the columns ``left`` and ``right`` of ``df``.

    Rows with a missing value in either column are dropped first. Constant
    input warnings from scipy and invalid-value warnings from numpy are
    suppressed for the call.

    Returns:
        A series with the entries ``r`` and ``p``, named
        ``'<left> & <right>'``.
    """
    df = df[[left, right]].dropna()
    name = f'{left} & {right}'
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', scipy.stats.ConstantInputWarning)
        with np.errstate(invalid='ignore'):
            result = func(df[left], df[right])
    return pd.Series(result, index=('r', 'p'), name=name)


plt.style.use('classic')
plt.rcParams.update({'figure.figsize': (6, 4),
                     'figure.facecolor': 'w',
                     'figure.subplot.bottom': .125,
                     'font.size': 10,
                     'savefig.dpi': 72})


# In[3]:

RENAME = {'Quechua (Ayacucho)': 'Ayacucho',
          'Tlachichilco Tepehuan': 'Tepehua',
          'Lakhota': 'Lakota'}

df = pd.read_csv(CSV, encoding='utf-8')
df['Language'] = df['Language'].replace(RENAME)
df.info()
# Each (language, cell, position) combination must occur only once.
assert df.set_index(['Language', 'Cell', 'Position']).index.is_unique
df.head(10)


# In[4]:

import itertools

SEP = '->'


def is_local(cellkey, sep=SEP, persons=('1', '2')) -> bool:
    """Return whether both sides of ``sep`` in ``cellkey`` contain one of ``persons``.

    A ``cellkey`` without ``sep`` has an empty right-hand side and therefore
    yields ``False``.
    """
    subj, _, obj = cellkey.partition(sep)
    return any(p in subj for p in persons) and any(p in obj for p in persons)


def is_inverse(cellkey, sep=SEP, hierarchy=('1', '2', '3', '4', 'x')) -> bool:
    """Return whether the left side of ``cellkey`` ranks below its right side.

    ``hierarchy`` is ordered from highest to lowest. The result is true if,
    for any pair of its symbols, the lower one occurs left of ``sep`` and the
    higher one occurs right of it.
    """
    subj, _, obj = cellkey.partition(sep)
    for high, low in itertools.combinations(hierarchy, 2):
        if low in subj and high in obj:
            return True
    return False


def is_inverse_restricted(cellkey, sep=SEP, high=('1', '2'), low=('3', '4', 'x')) -> bool:
    """Return whether a ``low`` symbol is left and a ``high`` symbol right of ``sep``."""
    subj, _, obj = cellkey.partition(sep)
    return any(l in subj for l in low) and any(h in obj for h in high)


def is_inverse_relaxed(cellkey, sep=SEP, high=('1', '2')) -> bool:
    """Return whether the part of ``cellkey`` right of ``sep`` contains a ``high`` symbol."""
    subj, _, obj = cellkey.partition(sep)
    return any(h in obj for h in high)


def get_slot(cellpos) -> str:
    """Return ``'prefix'`` for negative, ``'suffix'`` for positive, ``'stem'`` for zero positions."""
    return 'prefix' if cellpos < 0 else 'suffix' if cellpos > 0 else 'stem'


df.insert(2, 'Trans', df['Cell'].str.contains(SEP))
df.insert(3, 'Local', df['Cell'].map(is_local))
df.insert(4, 'Direct', ~df['Cell'].map(is_inverse))
df.insert(6, 'Slot', df['Position'].map(get_slot))
df.insert(9, 'Portmx', df['Meaning'].str.contains(SEP))
df.head(10)


# In[5]:
df['Language'].nunique()


# In[6]:

# Number of markers by transitivity and locality.
df.pivot_table('Position', 'Trans', 'Local', aggfunc=len, fill_value=0, margins=True)


# In[7]:

# Number of cells (one row per language and cell) by transitivity and locality.
(df.groupby(['Language', 'Cell'])[['Trans', 'Local']].first()
 .pivot_table(index='Trans', columns='Local', aggfunc=len, fill_value=0, margins=True))


# In[8]:

df.groupby('Language')['Cell'].nunique().to_frame()


# In[9]:

# Distribution over languages of the mean number of markers per cell.
(df.groupby(['Language', 'Cell'])['Position'].size()
 .groupby(level='Language').mean()
 .to_frame('marker/cell')
 .sort_values(by='marker/cell')
 .describe())


# In[10]:

# Population standard deviation (ddof=0) of the per-language means.
(df.groupby(['Language', 'Cell'])['Position'].size()
 .groupby(level='Language').mean()
 .std(ddof=0))


# ## Learned lexemes

# In[11]:

# Slot and Portmx must be constant within each (language, form, meaning) group,
# otherwise taking .first() below would be arbitrary.
assert (df.groupby(['Language', 'Form', 'Meaning'])[['Slot', 'Portmx']]
        .filter(lambda g: (g.nunique() > 1).any())
        .empty)

(df.groupby(['Language', 'Form', 'Meaning'])[['Slot', 'Portmx']].first()
 .pivot_table(index='Slot', columns='Portmx', aggfunc=len, fill_value=0, margins=True))


# In[12]:

(df.groupby(['Language', 'Form', 'Meaning'], as_index=False)['Portmx'].first()
 .pivot_table('Form', 'Language', 'Portmx', aggfunc=len, fill_value=0, margins=True))


# In[13]:

# Per lexeme: True if all its occurrences are local, False if none is,
# None (dropped by the query) if mixed.
llf = (df.groupby(['Language', 'Form', 'Meaning', 'Portmx'], as_index=False)['Local']
       .agg(lambda x: True if x.all() else False if not x.any() else None)
       .query('Local == Local')  # filter out NaN
       .pivot_table('Form', ['Language', 'Local'], 'Portmx',
                    aggfunc=len, fill_value=0, dropna=False))

(llf.join(llf.groupby(level='Language').apply(fisher_exact))
 .rename_axis('Portmx', axis=1)
 .query('p <= .05'))


# In[14]:

# Same as above for direct vs. inverse.
dlf = (df.groupby(['Language', 'Form', 'Meaning', 'Portmx'], as_index=False)['Direct']
       .agg(lambda x: True if x.all() else False if not x.any() else None)
       .query('Direct == Direct')  # filter out NaN
       .pivot_table('Form', ['Language', 'Direct'], 'Portmx',
                    aggfunc=len, fill_value=0, dropna=False))

# The tests must be computed from the Direct table (dlf) that is displayed,
# not from the Local table (llf) of the previous cell.
(dlf.join(dlf.groupby(level='Language').apply(fisher_exact))
 .rename_axis('Portmx', axis=1)
 .query('p <= .05'))


# ## Transitive occurrences

# In[15]:

ct = crosstab(df[df['Trans']], 'Local', 'Portmx', tests=True)
ct.df


# In[16]:

ct.expected


# In[17]:

ct.chi2.drop('expected', axis=1)


# In[18]:

ct.fisher_exact


# In[19]:

of = (df[df['Trans']]
      .groupby(['Language', 'Local'])['Portmx']
      .agg([('Markers', 'size'), ('Portmx', ntrue)]))
of['Ratio'] = 100 * of['Portmx'] / of['Markers']
# Normalise each ratio by the sum of the ratios of its language.
of['Ratio (norm)'] = of['Ratio'] / of['Ratio'].groupby(level='Language').sum()
# Where the normalised ratio is undefined, blank out the raw ratio as well.
of.loc[of['Ratio (norm)'].isnull(), 'Ratio'] = None
ofp = of.reset_index('Local')
of.unstack('Local')


# In[20]:

pd.DataFrame([pearsonr(ofp, 'Local', c) for c in ['Ratio', 'Ratio (norm)']])


# In[21]:

ctl = (df[df['Trans']]
       .pivot_table('Position', ['Language', 'Local'], 'Portmx', aggfunc=len, fill_value=0))

(ctl.join(ctl.groupby(level='Language').apply(chi2_contingency))
 .rename_axis('Portmx', axis=1)
 .query('usable == 1 & p <= .05'))


# In[22]:

(ctl.join(ctl.groupby(level='Language').apply(fisher_exact))
 .rename_axis('Portmx', axis=1)
 .query('p <= .05'))


# ## Transitive cells

# In[23]:

cells = (df[df['Trans']]
         .groupby(['Language', 'Cell', 'Local', 'Direct'], sort=False)['Portmx']
         .agg([('Markers', 'size'), ('Portmx', ntrue), ('Hasp', 'any')])
         .reset_index(['Local', 'Direct']))
cells['Ratio'] = 100 * cells['Portmx'] / cells['Markers']
cells


# In[24]:

cells.groupby(level='Language').apply(lambda x: pearsonr(x, 'Ratio', 'Markers'))


# In[25]:

ct = crosstab(cells, 'Local', 'Hasp', tests=True)
ct.df


# In[26]:

ct.expected


# In[27]:

ct.chi2.drop('expected', axis=1)


# In[28]:

ct.fisher_exact


# In[29]:

pd.DataFrame([pearsonr(cells, 'Local', c) for c in ['Hasp', 'Ratio']])


# In[30]:

ctl = cells.pivot_table('Markers', ['Language', 'Local'], 'Hasp', aggfunc=len, fill_value=0)

(ctl.join(ctl.groupby(level='Language').apply(chi2_contingency))
 .rename_axis('Hasp', axis=1)
 .query('usable == 1 & p <= .05'))


# In[31]:

(ctl.join(ctl.groupby(level='Language').apply(fisher_exact))
 .rename_axis('Hasp', axis=1)
 .query('p <= .05'))


# ## Cells with portmanteau vs. Occurrences vs. Mean cell ratio

# In[32]:

cf = (cells.reset_index()
      .groupby(['Language', 'Local'])
      .agg({'Cell': 'size', 'Hasp': ntrue, 'Markers': 'sum', 'Portmx': 'sum', 'Ratio': 'mean'})
      .rename(columns={'Cell': 'Cells', 'Ratio': 'Cell ratio'}))
cf.insert(2, 'Perc', 100 * cf['Hasp'] / cf['Cells'])
cf.insert(5, 'Ratio', 100 * cf['Portmx'] / cf['Markers'])
# Normalise the three measures by their per-language sums.
nf = cf[['Perc', 'Ratio', 'Cell ratio']]
nf /= nf.groupby(level='Language').sum()
cf = cf.join(nf, rsuffix=' (norm)')
# Where the normalised value is undefined, blank out the raw measures as well.
cf.loc[cf['Perc (norm)'].isnull(), ['Perc', 'Ratio', 'Cell ratio']] = None
cfp = cf.reset_index('Local')
cf = cf.unstack('Local')
cf.loc[:, :'Cell ratio']


# In[33]:

# Point-biserial correlation = Pearson r
pd.DataFrame([pearsonr(cfp, 'Local', c) for c in ['Perc', 'Ratio', 'Cell ratio']])


# In[34]:

cfp[['Perc', 'Ratio', 'Cell ratio']].corr()


# In[35]:

cf_sorted = cf.sort_values(by=('Cell ratio (norm)', True), kind='mergesort')
ax = cf_sorted['Cell ratio'].plot.bar(stacked=True, figsize=(15, 5), cmap=plt.cm.gray)
for lang in ['Aleut', 'Kunama', 'Tepehua']:
    # Bars are drawn at 0..n-1 in row order, so the bar position of a language
    # is its position in the sorted index (instead of a hard-coded number).
    x = cf_sorted.index.get_loc(lang)
    ax.annotate('*', (x - .05, cf.loc[lang, 'Cell ratio'].sum() + .5))
#ax.get_figure().savefig('results_local.pdf', bbox_inches='tight')


# In[36]:

(cells.groupby(level='Language').apply(lambda x: pearsonr(x, 'Local', 'Ratio'))['r']
 .sort_values(kind='mergesort')
 .plot.bar(figsize=(15, 5)));


# In[37]:

ax = cf['Cell ratio'].dropna().T.plot(figsize=(8, 6), legend=False)
ax.legend(bbox_to_anchor=(1.35, 1.15));


# In[38]:

cf['Cell ratio'].boxplot(return_type='axes');


# In[39]:

fig, ax = plt.subplots()
ax.violinplot(cf['Cell ratio'].dropna().values, showmedians=True, bw_method=.25);


# In[40]:

cf[['Markers', 'Portmx', 'Ratio (norm)', 'Cell ratio (norm)']]


# In[41]:

pd.DataFrame([pearsonr(cfp, 'Local', f'{c} (norm)') for c in ['Perc', 'Ratio', 'Cell ratio']])


# In[42]:

cfp[[f'{c} (norm)' for c in ['Perc', 'Ratio', 'Cell ratio']]].corr()


# In[43]:

ax = (cf['Cell ratio (norm)'].sort_values(by=True, kind='mergesort')
      .plot.bar(stacked=True, figsize=(15, 5), cmap=plt.cm.gray))
#ax.get_figure().savefig('results_local_norm.pdf', bbox_inches='tight')


# In[44]:

cf['Cell ratio (norm)'].sum().to_frame('Cell ratio (norm)').T


# In[45]:

ax = cf['Cell ratio (norm)'].dropna().T.plot(figsize=(8, 6), legend=False)
ax.legend(bbox_to_anchor=(1.35, 1.15));


# In[46]:

cf['Cell ratio (norm)'].boxplot(return_type='axes');


# In[47]:

fig, ax = plt.subplots()
ax.violinplot(cf['Cell ratio (norm)'].dropna().values, showmedians=True, bw_method=.25);


# # Direct

# In[48]:

ctd = (df[df['Trans']]
       .pivot_table('Position', ['Language', 'Direct'], 'Portmx', aggfunc=len, fill_value=0))

(ctd.join(ctd.groupby(level='Language').apply(fisher_exact))
 .rename_axis('Portmx', axis=1)
 .query('p <= .05'))


# In[49]:

cfd = (cells.reset_index()
       .groupby(['Language', 'Direct'])
       .agg({'Cell': 'size', 'Hasp': ntrue, 'Markers': 'sum', 'Portmx': 'sum', 'Ratio': 'mean'})
       .rename(columns={'Cell': 'Cells', 'Ratio': 'Cell ratio'}))
cfd.insert(2, 'Perc', 100 * cfd['Hasp'] / cfd['Cells'])
cfd.insert(5, 'Ratio', 100 * cfd['Portmx'] / cfd['Markers'])
# Normalise the three measures by their per-language sums.
nfd = cfd[['Perc', 'Ratio', 'Cell ratio']]
nfd /= nfd.groupby(level='Language').sum()
cfd = cfd.join(nfd, rsuffix=' (norm)')
# Where the normalised value is undefined, blank out the raw measures as well.
cfd.loc[cfd['Perc (norm)'].isnull(), ['Perc', 'Ratio', 'Cell ratio']] = None
cfdp = cfd.reset_index('Direct')
cfd = cfd.unstack('Direct')
cfd.loc[:, :'Cell ratio']


# In[50]:

pd.DataFrame([pearsonr(cfdp, 'Direct', c) for c in ['Perc', 'Ratio', 'Cell ratio']])


# In[51]:

cfd_sorted = cfd.sort_values(by=('Cell ratio (norm)', True), kind='mergesort')
ax = cfd_sorted['Cell ratio'].plot.bar(stacked=True, figsize=(15, 5), cmap=plt.cm.gray)
for lang in ['Aleut', 'Fox', 'Jumjum', 'Yimas']:
    # Bars are drawn at 0..n-1 in row order, so the bar position of a language
    # is its position in the sorted index (instead of a hard-coded number).
    x = cfd_sorted.index.get_loc(lang)
    ax.annotate('*', (x - .05, cfd.loc[lang, 'Cell ratio'].sum() + .5))
#ax.get_figure().savefig('results_direct.pdf', bbox_inches='tight')


# In[52]:

(cells.groupby(level='Language').apply(lambda x: pearsonr(x, 'Direct', 'Ratio'))['r']
 .sort_values(kind='mergesort')
 .plot.bar(figsize=(15, 5)));


# In[53]:

cfd[['Markers', 'Portmx', 'Ratio (norm)', 'Cell ratio (norm)']]


# In[54]:

pd.DataFrame([pearsonr(cfdp, 'Direct', f'{c} (norm)') for c in ['Perc', 'Ratio', 'Cell ratio']])


# In[55]:

cfdp[[f'{c} (norm)' for c in ['Perc', 'Ratio', 'Cell ratio']]].corr()


# In[56]:

ax = (cfd['Cell ratio (norm)'].sort_values(by=True, kind='mergesort')
      .plot.bar(stacked=True, figsize=(15, 5), cmap=plt.cm.gray))
#ax.get_figure().savefig('results_direct_norm.pdf', bbox_inches='tight')


# In[57]:

cfd['Cell ratio (norm)'].sum().to_frame('Cell ratio (norm)').T