#!/usr/bin/env python
# coding: utf-8

# In[1]:


import functools
import hashlib
import io
import pathlib
import shutil
import urllib.request
import zipfile

# Zip archive that is downloaded below when the CSV is not present locally.
URL = 'https://zenodo.org/record/841982/files/xflr6/portmanteaus-v1.0.zip'

# Local path of the analyses table; its name also selects the archive member.
CSV = pathlib.Path('esm3-analyses.csv')


def sha256sum(filename, bufsize=32768) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``filename``.

    The file is opened in binary mode and consumed in pieces of at most
    ``bufsize`` bytes, so it is never loaded into memory as a whole.
    """
    digest = hashlib.sha256()
    with open(filename, 'rb') as stream:
        while True:
            chunk = stream.read(bufsize)
            if chunk == b'':  # end of file
                break
            digest.update(chunk)
    return digest.hexdigest()


# Fetch the archive only when the table is missing; the whole zip is buffered
# in memory and just the single member ending in the CSV name is unpacked.
if not CSV.exists():
    with io.BytesIO() as archive:
        with urllib.request.urlopen(URL) as response:
            shutil.copyfileobj(response, archive)
        with zipfile.ZipFile(archive) as bundle:
            matches = [m for m in bundle.infolist() if m.filename.endswith(CSV.name)]
            # exactly one member must match, otherwise unpacking raises ValueError
            (member,) = matches
            # drop any directory prefix so the file lands next to the notebook
            member.filename = CSV.name
            bundle.extract(member)

# Show the checksum of the table that the analysis below is based on.
sha256sum(CSV)


# In[2]:


# Run the ``%matplotlib inline`` line magic. ``get_ipython`` is not defined in
# this file; presumably it is supplied by the IPython/Jupyter session.
get_ipython().run_line_magic('matplotlib', 'inline')

import types
import warnings

import matplotlib.pyplot as plt
import numpy as np
from numpy import count_nonzero as ntrue
import pandas as pd
import scipy.stats


def crosstab(df, index, columns, tests=False, margins=True, **kwargs):
    """Cross-tabulate columns of ``df``, optionally with association tests.

    ``index`` and ``columns`` are either a single column name or a list of
    column names; ``margins`` and any extra keyword arguments are passed on
    to ``pandas.crosstab``.

    With a falsy ``tests`` the contingency table is returned as is.
    Otherwise the result is a namespace with the attributes ``df`` (the
    table), ``expected`` (expected frequencies from the chi-square test),
    ``chi2`` and ``fisher_exact`` (one-row frames of test results). The row
    label is ``tests`` itself if it is a string, else it is built from
    ``index`` and ``columns``.
    """
    def pick(keys):
        # a list selects several grouping columns, anything else exactly one
        if isinstance(keys, list):
            return [df[key] for key in keys]
        return df[keys]

    table = pd.crosstab(pick(index), pick(columns), margins=margins, **kwargs)
    if not tests:
        return table

    # the tests must only see observed counts, not the 'All' row and column
    observed = table.iloc[:-1, :-1] if margins else table
    chi2_result = chi2_contingency(observed)
    fisher_result = fisher_exact(observed)

    if isinstance(tests, str):
        label = tests
    else:
        label = f'{index} & {columns}'

    chi2_frame = chi2_result.to_frame(label).T
    fisher_frame = fisher_result.to_frame(label).T
    expected = pd.DataFrame(chi2_frame.at[label, 'expected'],
                            index=observed.index, columns=observed.columns)
    return types.SimpleNamespace(df=table, expected=expected,
                                 chi2=chi2_frame, fisher_exact=fisher_frame)


def chi2_contingency(x, func=scipy.stats.chi2_contingency):
    """Run a chi-square contingency test on ``x`` and label the outcome.

    Returns a Series with the entries ``chi2``, ``p``, ``dof``, ``expected``
    (the table of expected frequencies) and ``usable`` (whether every
    expected frequency is at least 5). Returns ``None`` when ``func``
    raises ``ValueError``.
    """
    try:
        statistic, pvalue, dof, expected = func(x)
    except ValueError:
        return None
    usable = (expected >= 5).all()
    labels = ('chi2', 'p', 'dof', 'expected', 'usable')
    return pd.Series((statistic, pvalue, dof, expected, usable), index=labels)


def fisher_exact(x, func=scipy.stats.fisher_exact):
    """Run Fisher's exact test on ``x`` and return it as a Series.

    The two values produced by ``func`` are labelled ``odds`` and ``p``.
    """
    outcome = func(x)
    labels = ('odds', 'p')
    return pd.Series(outcome, index=labels)


def pearsonr(df, left, right, func=scipy.stats.pearsonr):
    """Correlate the columns ``left`` and ``right`` of ``df``.

    Rows with a missing value in either column are dropped first. The two
    values produced by ``func`` are returned as a Series labelled ``r`` and
    ``p`` whose name combines both column names. Constant-input warnings
    and invalid-value floating point errors are silenced during the call.
    """
    pair = df[[left, right]].dropna()
    label = f'{left} & {right}'
    with warnings.catch_warnings(), np.errstate(invalid='ignore'):
        warnings.simplefilter('ignore', scipy.stats.ConstantInputWarning)
        outcome = func(pair[left], pair[right])
    return pd.Series(outcome, index=('r', 'p'), name=label)


# Apply matplotlib's 'classic' style, then override the default figure size,
# figure background, bottom subplot margin, font size and save resolution.
plt.style.use('classic')
plt.rcParams.update({'figure.figsize': (6, 4), 'figure.facecolor': 'w',
                     'figure.subplot.bottom': .125, 'font.size': 10, 'savefig.dpi': 72})


# In[3]:


# Replacement labels for some of the language names found in the CSV.
RENAME = {'Quechua (Ayacucho)': 'Ayacucho',
          'Tlachichilco Tepehuan': 'Tepehua',
          'Lakhota': 'Lakota'}


# Load the analyses table.
df = pd.read_csv(CSV, encoding='utf-8')

# Swap in the replacement labels; names not listed in RENAME stay unchanged.
df['Language'] = df['Language'].replace(RENAME)

df.info()
# Sanity check: (Language, Cell, Position) must identify each row uniquely.
assert df.set_index(['Language', 'Cell', 'Position']).index.is_unique
df.head(10)


# In[4]:


import itertools

# Separator splitting a cell key into its subject part and object part; also
# searched for in 'Cell' and 'Meaning' to derive the Trans and Portmx columns.
SEP = '->'


def is_local(cellkey, sep=SEP, persons=('1', '2')) -> bool:
    """Return True if both sides of ``cellkey`` mention one of ``persons``.

    ``cellkey`` is split at the first ``sep`` into a subject and an object
    part; a key without ``sep`` has an empty object part and is never local.
    """
    subj, _, obj = cellkey.partition(sep)
    for side in (subj, obj):
        if not any(person in side for person in persons):
            return False
    return True


def is_inverse(cellkey, sep=SEP, hierarchy=('1', '2', '3', '4', 'x')) -> bool:
    """Return True if the subject ranks below the object on ``hierarchy``.

    ``hierarchy`` is ordered from highest to lowest. The key is inverse as
    soon as some lower-ranked symbol occurs in the subject part while some
    higher-ranked symbol occurs in the object part.
    """
    subj, _, obj = cellkey.partition(sep)
    return any(lower in subj and higher in obj
               for higher, lower in itertools.combinations(hierarchy, 2))


def is_inverse_restricted(cellkey, sep=SEP, high=('1', '2'), low=('3', '4', 'x')) -> bool:
    """Return True if a ``low`` symbol acts on a ``high`` symbol.

    That is, the subject part of ``cellkey`` contains one of ``low`` and
    the object part contains one of ``high``.
    """
    subj, _, obj = cellkey.partition(sep)
    if not any(symbol in subj for symbol in low):
        return False
    return any(symbol in obj for symbol in high)


def is_inverse_relaxed(cellkey, sep=SEP, high=('1', '2')) -> bool:
    """Return True if the object part of ``cellkey`` contains one of ``high``.

    The subject part is not inspected.
    """
    obj = cellkey.partition(sep)[2]
    for symbol in high:
        if symbol in obj:
            return True
    return False


def get_slot(cellpos) -> str:
    """Name the slot that the position ``cellpos`` falls into.

    Returns ``'prefix'`` for negative positions, ``'suffix'`` for positive
    positions and ``'stem'`` otherwise.
    """
    # The return annotation used to say ``bool`` although a label is returned.
    if cellpos < 0:
        return 'prefix'
    if cellpos > 0:
        return 'suffix'
    return 'stem'


# Derived columns, inserted at fixed positions of the table:
#   Trans  - the cell key contains SEP
#   Local  - is_local() of the cell key
#   Direct - negation of is_inverse() of the cell key
#   Slot   - get_slot() of the position
#   Portmx - the marker meaning contains SEP
df.insert(2, 'Trans', df['Cell'].str.contains(SEP))
df.insert(3, 'Local', df['Cell'].map(is_local))
df.insert(4, 'Direct', ~df['Cell'].map(is_inverse))
df.insert(6, 'Slot', df['Position'].map(get_slot))
df.insert(9, 'Portmx', df['Meaning'].str.contains(SEP))

df.head(10)


# In[5]:


# Number of distinct languages in the table.
df['Language'].nunique()


# In[6]:


# Row counts by Trans (rows) and Local (columns), including margins.
df.pivot_table('Position', 'Trans', 'Local', aggfunc=len, fill_value=0, margins=True)


# In[7]:


# The same cross-tabulation, but counting each (Language, Cell) pair once.
(df.groupby(['Language', 'Cell'])[['Trans', 'Local']].first()
 .pivot_table(index='Trans', columns='Local', aggfunc=len, fill_value=0, margins=True))


# In[8]:


# Number of distinct cells per language.
df.groupby('Language')['Cell'].nunique().to_frame()


# In[9]:


# Rows per (Language, Cell), averaged within each language; describe() then
# summarises these per-language means.
(df.groupby(['Language', 'Cell'])['Position'].size()
 .groupby(level='Language').mean()
 .to_frame('marker/cell')
 .sort_values(by='marker/cell')
 .describe())


# In[10]:


# Standard deviation of the per-language means with ddof=0.
(df.groupby(['Language', 'Cell'])['Position'].size()
 .groupby(level='Language').mean()
 .std(ddof=0))


# ## Learned lexemes

# In[11]:


# Sanity check: no (Language, Form, Meaning) group may have more than one
# distinct Slot or Portmx value, so taking first() below loses nothing.
assert (df.groupby(['Language', 'Form', 'Meaning'])[['Slot', 'Portmx']]
        .filter(lambda g: (g.nunique() > 1).any())
        .empty)

# (Language, Form, Meaning) groups counted by Slot and Portmx, with margins.
(df.groupby(['Language', 'Form', 'Meaning'])[['Slot', 'Portmx']].first()
 .pivot_table(index='Slot', columns='Portmx', aggfunc=len, fill_value=0, margins=True))


# In[12]:


# (Language, Form, Meaning) groups per language, split by Portmx, with margins.
(df.groupby(['Language', 'Form', 'Meaning'], as_index=False)['Portmx'].first()
 .pivot_table('Form', 'Language', 'Portmx', aggfunc=len, fill_value=0, margins=True))


# In[13]:


# For every (Language, Form, Meaning, Portmx) group record whether all of its
# rows are local (True), none is (False) or they are mixed (None). The query
# 'Local == Local' drops the mixed groups because a missing value never equals
# itself. The remaining groups are counted per (Language, Local) by Portmx.
llf = (df.groupby(['Language', 'Form', 'Meaning', 'Portmx'], as_index=False)['Local']
       .agg(lambda x: True if x.all() else False if not x.any() else None)
       .query('Local == Local')
       .pivot_table('Form', ['Language', 'Local'], 'Portmx', aggfunc=len, fill_value=0, dropna=False))

# Attach a per-language Fisher exact test and keep the rows with p <= .05.
(llf.join(llf.groupby(level='Language').apply(fisher_exact))
 .rename_axis('Portmx', axis=1)
 .query('p <= .05'))


# In[14]:


# For every (Language, Form, Meaning, Portmx) group record whether all of its
# rows are direct (True), none is (False) or they are mixed (None), then count
# the unmixed groups per (Language, Direct) by Portmx.
dlf = (df.groupby(['Language', 'Form', 'Meaning', 'Portmx'], as_index=False)['Direct']
       .agg(lambda x: True if x.all() else False if not x.any() else None)
       .query('Direct == Direct')  # filter out NaN (a missing value never equals itself)
       .pivot_table('Form', ['Language', 'Direct'], 'Portmx', aggfunc=len, fill_value=0, dropna=False))

# Attach a per-language Fisher exact test and keep the rows with p <= .05.
# The test has to be computed from dlf itself: the previous version grouped
# llf (the Local table) here, so the Direct counts were shown next to odds
# and p-values that belonged to the Local analysis.
(dlf.join(dlf.groupby(level='Language').apply(fisher_exact))
 .rename_axis('Portmx', axis=1)
 .query('p <= .05'))


# ## Transitive occurrences

# In[15]:


# Local by Portmx contingency table over the transitive rows, together with
# chi-square and Fisher exact tests.
ct = crosstab(df[df['Trans']], 'Local', 'Portmx', tests=True)
ct.df


# In[16]:


# Expected frequencies from the chi-square test.
ct.expected


# In[17]:


# Chi-square result without the nested table of expected frequencies.
ct.chi2.drop('expected', axis=1)


# In[18]:


ct.fisher_exact


# In[19]:


# Per (Language, Local) over the transitive rows: number of rows ('Markers')
# and number of rows with Portmx set ('Portmx').
of = (df[df['Trans']]
      .groupby(['Language', 'Local'])['Portmx']
      .agg([('Markers', 'size'), ('Portmx', ntrue)]))

# Percentage of Portmx rows, and that percentage divided by the sum of the
# language's percentages.
of['Ratio'] = 100 * of['Portmx'] / of['Markers']
of['Ratio (norm)'] = of['Ratio'] / of['Ratio'].groupby(level='Language').sum()
# Where the normalised ratio is missing, blank the raw ratio as well.
of.loc[of['Ratio (norm)'].isnull(), 'Ratio'] = None

# Variant with 'Local' as an ordinary column, used for the correlations below.
ofp = of.reset_index('Local')

of.unstack('Local')


# In[20]:


# Correlation of Local with the raw and the normalised ratio.
pd.DataFrame([pearsonr(ofp, 'Local', c) for c in ['Ratio', 'Ratio (norm)']])


# In[21]:


# Transitive rows counted per (Language, Local) by Portmx.
ctl = (df[df['Trans']]
       .pivot_table('Position', ['Language', 'Local'], 'Portmx', aggfunc=len, fill_value=0))

# Per-language chi-square test; keep rows where the test is flagged usable
# and p <= .05.
(ctl.join(ctl.groupby(level='Language').apply(chi2_contingency))
 .rename_axis('Portmx', axis=1)
 .query('usable == 1 & p <= .05'))


# In[22]:


# Per-language Fisher exact test; keep rows with p <= .05.
(ctl.join(ctl.groupby(level='Language').apply(fisher_exact))
 .rename_axis('Portmx', axis=1)
 .query('p <= .05'))


# ## Transitive cells

# In[23]:


# One row per transitive (Language, Cell): number of rows ('Markers'), number
# of rows with Portmx set ('Portmx') and whether any row has it ('Hasp').
# Local and Direct are moved from the index back into ordinary columns.
cells = (df[df['Trans']]
         .groupby(['Language', 'Cell', 'Local', 'Direct'], sort=False)['Portmx']
         .agg([('Markers', 'size'), ('Portmx', ntrue), ('Hasp', 'any')])
         .reset_index(['Local', 'Direct']))

# Percentage of the cell's rows that have Portmx set.
cells['Ratio'] = 100 * cells['Portmx'] / cells['Markers']

cells


# In[24]:


# Per-language correlation between a cell's ratio and its number of rows.
cells.groupby(level='Language').apply(lambda x: pearsonr(x, 'Ratio', 'Markers'))


# In[25]:


# Local by Hasp contingency table over the cells, together with chi-square
# and Fisher exact tests.
ct = crosstab(cells, 'Local', 'Hasp', tests=True)

ct.df


# In[26]:


# Expected frequencies from the chi-square test.
ct.expected


# In[27]:


# Chi-square result without the nested table of expected frequencies.
ct.chi2.drop('expected', axis=1)


# In[28]:


ct.fisher_exact


# In[29]:


# Correlation of Local with Hasp and with the per-cell ratio.
pd.DataFrame([pearsonr(cells, 'Local', c) for c in ['Hasp', 'Ratio']])


# In[30]:


# Cells counted per (Language, Local) by Hasp.
ctl = cells.pivot_table('Markers', ['Language', 'Local'], 'Hasp', aggfunc=len, fill_value=0)

# Per-language chi-square test; keep rows where the test is flagged usable
# and p <= .05.
(ctl.join(ctl.groupby(level='Language').apply(chi2_contingency))
 .rename_axis('Hasp', axis=1)
 .query('usable == 1 & p <= .05'))


# In[31]:


# Per-language Fisher exact test; keep rows with p <= .05.
(ctl.join(ctl.groupby(level='Language').apply(fisher_exact))
 .rename_axis('Hasp', axis=1)
 .query('p <= .05'))


# ## Cells with portmanteau vs. Occurrences vs. Mean cell ratio

# In[32]:


# Per (Language, Local): number of cells ('Cells'), number of cells with Hasp
# set, summed Markers and Portmx, and the mean per-cell ratio ('Cell ratio').
cf = (cells.reset_index()
      .groupby(['Language', 'Local'])
      .agg({'Cell': 'size', 'Hasp': ntrue, 'Markers': 'sum',
            'Portmx': 'sum', 'Ratio': 'mean'})
      .rename(columns={'Cell': 'Cells', 'Ratio': 'Cell ratio'}))

# Perc: percentage of cells with Hasp; Ratio: percentage of rows with Portmx.
cf.insert(2, 'Perc', 100 * cf['Hasp'] / cf['Cells'])
cf.insert(5, 'Ratio', 100 * cf['Portmx'] / cf['Markers'])
# Divide the three measures by their per-language sums and add the results
# as '... (norm)' columns.
nf = cf[['Perc', 'Ratio', 'Cell ratio']]
nf /= nf.groupby(level='Language').sum()
cf = cf.join(nf, rsuffix=' (norm)')
# Where the normalised value is missing, blank the raw measures as well.
cf.loc[cf['Perc (norm)'].isnull(), ['Perc', 'Ratio', 'Cell ratio']] = None

# Variant with 'Local' as an ordinary column, used for the correlations below.
cfp = cf.reset_index('Local')

# Wide layout: one row per language, Local values as the inner column level.
cf = cf.unstack('Local')

# Show the columns up to and including 'Cell ratio'.
cf.loc[:, :'Cell ratio']


# In[33]:


# Point-biserial correlation = Pearson r
pd.DataFrame([pearsonr(cfp, 'Local', c) for c in ['Perc', 'Ratio', 'Cell ratio']])


# In[34]:


# Pairwise correlations between the three measures.
cfp[['Perc', 'Ratio', 'Cell ratio']].corr()


# In[35]:


# Stacked bars of 'Cell ratio' per language, ordered by the normalised cell
# ratio of the Local=True column (mergesort keeps ties in their prior order).
ax = (cf.sort_values(by=('Cell ratio (norm)', True), kind='mergesort')['Cell ratio']
      .plot.bar(stacked=True, figsize=(15, 5), cmap=plt.cm.gray))

# Put an asterisk just above the bars of selected languages. The x values are
# hard-coded; presumably they are the bar positions of these languages in the
# sorted order above -- verify whenever the data changes.
for x, lang in [(1, 'Aleut'), (6, 'Kunama'), (20, 'Tepehua')]:
    ax.annotate('*', (x - .05, cf.loc[lang, 'Cell ratio'].sum() + .5))

#ax.get_figure().savefig('results_local.pdf', bbox_inches='tight')


# In[36]:


# Per-language correlation between Local and the per-cell ratio, as sorted bars.
(cells.groupby(level='Language').apply(lambda x: pearsonr(x, 'Local', 'Ratio'))['r']
 .sort_values(kind='mergesort')
 .plot.bar(figsize=(15, 5)));


# In[37]:


# One line per language across the Local values; languages with missing
# values are left out. The legend is placed outside the axes.
ax = cf['Cell ratio'].dropna().T.plot(figsize=(8, 6), legend=False)
ax.legend(bbox_to_anchor=(1.35, 1.15));


# In[38]:


# Distribution of 'Cell ratio' for each Local value.
cf['Cell ratio'].boxplot(return_type='axes');


# In[39]:


fig, ax = plt.subplots()

# Violin plot of the same values, restricted to languages without missing data.
ax.violinplot(cf['Cell ratio'].dropna().values, showmedians=True, bw_method=.25);


# In[40]:


# Row and Portmx totals next to the normalised measures.
cf[['Markers', 'Portmx', 'Ratio (norm)', 'Cell ratio (norm)']]


# In[41]:


# Correlation of Local with each normalised measure.
pd.DataFrame([pearsonr(cfp, 'Local', f'{c} (norm)') for c in ['Perc', 'Ratio', 'Cell ratio']])


# In[42]:


# Pairwise correlations between the normalised measures.
cfp[[f'{c} (norm)' for c in ['Perc', 'Ratio', 'Cell ratio']]].corr()


# In[43]:


# Stacked bars of the normalised cell ratio, ordered by the Local=True column.
ax = (cf['Cell ratio (norm)'].sort_values(by=True, kind='mergesort')
      .plot.bar(stacked=True, figsize=(15, 5), cmap=plt.cm.gray))

#ax.get_figure().savefig('results_local_norm.pdf', bbox_inches='tight')


# In[44]:


# Column sums of the normalised cell ratio, shown as a single row.
cf['Cell ratio (norm)'].sum().to_frame('Cell ratio (norm)').T


# In[45]:


# One line per language across the Local values; languages with missing
# values are left out. The legend is placed outside the axes.
ax = cf['Cell ratio (norm)'].dropna().T.plot(figsize=(8, 6), legend=False)

ax.legend(bbox_to_anchor=(1.35, 1.15));


# In[46]:


# Distribution of the normalised cell ratio for each Local value.
cf['Cell ratio (norm)'].boxplot(return_type='axes');


# In[47]:


fig, ax = plt.subplots()

# Violin plot of the same values, restricted to languages without missing data.
ax.violinplot(cf['Cell ratio (norm)'].dropna().values, showmedians=True, bw_method=.25);


# # Direct

# In[48]:


# Transitive rows counted per (Language, Direct) by Portmx.
# (The expression is already parenthesised, so no backslash continuation is
# needed after the first line.)
ctd = (df[df['Trans']]
       .pivot_table('Position', ['Language', 'Direct'], 'Portmx', aggfunc=len, fill_value=0))

# Per-language Fisher exact test; keep rows with p <= .05.
(ctd.join(ctd.groupby(level='Language').apply(fisher_exact))
 .rename_axis('Portmx', axis=1)
 .query('p <= .05'))


# In[49]:


# Per (Language, Direct): number of cells ('Cells'), number of cells with Hasp
# set, summed Markers and Portmx, and the mean per-cell ratio ('Cell ratio').
# (The chain is parenthesised, so no backslash continuation is needed.)
cfd = (cells.reset_index()
       .groupby(['Language', 'Direct'])
       .agg({'Cell': 'size', 'Hasp': ntrue,
             'Markers': 'sum', 'Portmx': 'sum', 'Ratio': 'mean'})
       .rename(columns={'Cell': 'Cells', 'Ratio': 'Cell ratio'}))

# Perc: percentage of cells with Hasp; Ratio: percentage of rows with Portmx.
cfd.insert(2, 'Perc', 100 * cfd['Hasp'] / cfd['Cells'])
cfd.insert(5, 'Ratio', 100 * cfd['Portmx'] / cfd['Markers'])
# Divide the three measures by their per-language sums and add the results
# as '... (norm)' columns.
nfd = cfd[['Perc', 'Ratio', 'Cell ratio']]
nfd /= nfd.groupby(level='Language').sum()
cfd = cfd.join(nfd, rsuffix=' (norm)')
# Where the normalised value is missing, blank the raw measures as well.
cfd.loc[cfd['Perc (norm)'].isnull(), ['Perc', 'Ratio', 'Cell ratio']] = None

# Variant with 'Direct' as an ordinary column, used for the correlations below.
cfdp = cfd.reset_index('Direct')

# Wide layout: one row per language, Direct values as the inner column level.
cfd = cfd.unstack('Direct')

# Show the columns up to and including 'Cell ratio'.
cfd.loc[:, :'Cell ratio']


# In[50]:


# Correlation of Direct with each of the three measures.
pd.DataFrame([pearsonr(cfdp, 'Direct', c) for c in ['Perc', 'Ratio', 'Cell ratio']])


# In[51]:


# Stacked bars of 'Cell ratio' per language, ordered by the normalised cell
# ratio of the Direct=True column (mergesort keeps ties in their prior order).
ax = (cfd.sort_values(by=('Cell ratio (norm)', True), kind='mergesort')['Cell ratio']
      .plot.bar(stacked=True, figsize=(15, 5), cmap=plt.cm.gray))

# Put an asterisk just above the bars of selected languages. The x values are
# hard-coded; presumably they are the bar positions of these languages in the
# sorted order above -- verify whenever the data changes.
for x, lang in [(19, 'Aleut'), (1, 'Fox'), (13, 'Jumjum'), (17, 'Yimas')]:
    ax.annotate('*', (x - .05, cfd.loc[lang, 'Cell ratio'].sum() + .5))

#ax.get_figure().savefig('results_direct.pdf', bbox_inches='tight')


# In[52]:


# Per-language correlation between Direct and the per-cell ratio, as sorted bars.
(cells.groupby(level='Language').apply(lambda x: pearsonr(x, 'Direct', 'Ratio'))['r']
 .sort_values(kind='mergesort')
 .plot.bar(figsize=(15, 5)));


# In[53]:


# Row and Portmx totals next to the normalised measures.
cfd[['Markers', 'Portmx', 'Ratio (norm)', 'Cell ratio (norm)']]


# In[54]:


# Correlation of Direct with each normalised measure.
pd.DataFrame([pearsonr(cfdp, 'Direct', f'{c} (norm)') for c in ['Perc', 'Ratio', 'Cell ratio']])


# In[55]:


# Pairwise correlations between the normalised measures.
cfdp[[f'{c} (norm)' for c in ['Perc', 'Ratio', 'Cell ratio']]].corr()


# In[56]:


# Stacked bars of the normalised cell ratio, ordered by the Direct=True column.
ax = (cfd['Cell ratio (norm)'].sort_values(by=True, kind='mergesort')
      .plot.bar(stacked=True, figsize=(15, 5), cmap=plt.cm.gray))

#ax.get_figure().savefig('results_direct_norm.pdf', bbox_inches='tight')


# In[57]:


# Column sums of the normalised cell ratio, shown as a single row.
cfd['Cell ratio (norm)'].sum().to_frame('Cell ratio (norm)').T