#!/usr/bin/env python
# coding: utf-8

# ## Load segmented affix occurrences from supplement

# In[1]:


import functools
import hashlib
import io
import pathlib
import shutil
import urllib.request
import zipfile

URL = 'https://zenodo.org/record/841982/files/xflr6/portmanteaus-v1.0.zip'

CSV = pathlib.Path('esm3-analyses.csv')


def sha256sum(filename, bufsize=32768):
    """Return the hexadecimal SHA-256 digest of the file at ``filename``.

    The file is opened in binary mode and fed to the hash in chunks of
    ``bufsize`` bytes, so its content is never read in one piece.
    """
    digest = hashlib.sha256()
    with open(filename, 'rb') as stream:
        while True:
            chunk = stream.read(bufsize)
            if chunk == b'':  # nothing more to read
                break
            digest.update(chunk)
    return digest.hexdigest()


# Fetch the supplement archive only when the CSV is not there yet.
if not CSV.exists():
    # The archive is buffered in memory and handed to ZipFile from there.
    with io.BytesIO() as archive_bytes:
        with urllib.request.urlopen(URL) as response:
            shutil.copyfileobj(response, archive_bytes)
        with zipfile.ZipFile(archive_bytes) as archive:
            candidates = [info for info in archive.infolist()
                          if info.filename.endswith(CSV.name)]
            # exactly one archive member may match, otherwise this raises
            member, = candidates
            # drop the member's directory part so it is extracted as CSV.name
            member.filename = CSV.name
            archive.extract(member)

# Show the checksum of the file the analysis runs on.
sha256sum(CSV)


# In[2]:


# IPython line magic '%matplotlib inline', as written out by the notebook
# export; get_ipython() is only available when run under IPython/Jupyter.
get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats


def pearsonr(df, left, right, func=scipy.stats.pearsonr):
    """Correlate the columns ``left`` and ``right`` of ``df``.

    Rows with a missing value in either column are dropped before ``func``
    is called with the two columns.  Its two-element result is returned as
    a Series labelled ``r`` and ``p`` and named ``'<left> & <right>'``.
    """
    complete = df[[left, right]].dropna()
    # ignore NumPy 'invalid value' floating point warnings during the call
    with np.errstate(invalid='ignore'):
        stats = func(complete[left], complete[right])
    label = f'{left} & {right}'
    return pd.Series(stats, index=('r', 'p'), name=label)


# Global figure settings for all plots below.
plt.style.use('classic')
plt.rcParams.update({
    'figure.figsize': (6, 4),
    'figure.facecolor': 'w',
    'figure.subplot.bottom': .125,
    'font.size': 10,
    'savefig.dpi': 72,
})


# In[3]:


# Language labels to replace after loading (old label -> new label).
RENAME = {
    'Quechua (Ayacucho)': 'Ayacucho',
    'Tlachichilco Tepehuan': 'Tepehua',
    'Lakhota': 'Lakota',
}


cf = (pd.read_csv(CSV, encoding='utf-8')
      .assign(Language=lambda x: x['Language'].replace(RENAME))
      # stable sort: rows of one language keep their order from the file
      .sort_values(by='Language', kind='mergesort')
      .reset_index(drop=True))

cf.info()
# every row must be identified by its language, cell and position
assert cf.set_index(['Language', 'Cell', 'Position']).index.is_unique
cf.head(10)


# ## Reconcatenate word-forms with stem symbol

# In[4]:


# Symbol standing in for the stem inside a reconcatenated word-form.
STEM = 'Σ'


# the stem symbol must not already occur in any form
assert not cf['Form'].str.contains(STEM).any()

# Rows without their meanings, numbered per (Language, Cell) group in
# order of first appearance.
_cf = cf.drop(columns='Meaning')
_cf = _cf.assign(cell_index=_cf.groupby(['Language', 'Cell'], sort=False).ngroup())

# One extra row per cell carrying the stem symbol at position 0.
_sf = _cf.drop_duplicates('cell_index').assign(Position=0, Form=STEM)

# Order the pieces of each cell by position and join them into one string.
df = pd.concat([_cf, _sf]).sort_values(by=['cell_index', 'Position'])
df = df.groupby(['cell_index', 'Language', 'Cell'])[['Form']].agg(''.join)
df = df.reset_index('cell_index', drop=True)

df.info()
assert df.index.is_unique
df.head(10)


# ## Tag cells as 1/2<->1/2, 1/2<->3, and other

# In[5]:


# Separator between the two arguments inside a cell key.
SEP = '->'


def is_distinct_local(cellkey, sep=SEP, persons=('1', '2')):
    """Classify a cell key by which sides of ``sep`` mention a local person.

    An argument is local when it contains any of the ``persons`` substrings.
    Returns ``True`` when both sides are local, ``False`` when ``sep`` occurs
    in the key and exactly one side is local, and ``None`` in every other
    case (including keys without ``sep``).
    """
    subj, trans, obj = cellkey.partition(sep)

    def mentions_local(arg):
        return any(p in arg for p in persons)

    subj_is_local = mentions_local(subj)
    obj_is_local = mentions_local(obj)

    if subj_is_local and obj_is_local:
        return True
    if trans and (subj_is_local or obj_is_local):
        return False
    return None


# Tag every cell by its key and put the tag in front as column 'd_local'.
_d_local = df.index.get_level_values('Cell').map(is_distinct_local)
df.insert(0, 'd_local', _d_local)

df.info()
assert df.index.is_unique
df.head(30)


# ## Neutralization ratios (1/2<->3 vs. 1/2<->1/2)

# In[6]:


# Per language and tag (untagged cells dropped): number of cells ('size')
# and number of distinct word-forms among them ('nunique').
xf = (df.dropna(subset=['d_local'])
      .groupby(['Language', 'd_local'])['Form']
      .agg(['size', 'nunique']))

# surplus of cells over distinct forms
xf['neut'] = xf['size'] - xf['nunique']
# ... as a percentage of size - 1 (NaN for groups with a single cell)
xf['ratio'] = 100 * xf['neut'] / (xf['size'] - 1)
# each ratio relative to the sum of the ratios of its language
xf['ratio (norm)'] = xf['ratio'] / xf['ratio'].groupby(level='Language').sum()
# blank the raw ratio wherever the normalized one is undefined
xf.loc[xf['ratio (norm)'].isnull(), 'ratio'] = None

# long format (d_local as column) is kept for the correlation below,
# wide format (d_local values as column level) for display and plotting
xfp = xf.reset_index('d_local')
xf = xf.unstack()

xf


# In[7]:


# Stacked bars of the raw ratios; languages are ordered by the normalized
# ratio of the d_local == True column (stable sort).
xf.sort_values(by=('ratio (norm)', True), kind='mergesort')['ratio'].plot.bar(
    stacked=True, figsize=(15, 5), cmap=plt.cm.gray);


# In[8]:


# Stacked bars of the normalized ratios, languages ordered by the
# d_local == True column (stable sort).
xf['ratio (norm)'].sort_values(by=True, kind='mergesort').plot.bar(
    stacked=True, figsize=(15, 5), cmap=plt.cm.gray);


# ## Test for neutralization differences

# In[9]:


# Correlate the tag with the raw and with the normalized ratio (one row each).
pd.DataFrame([
    pearsonr(xfp, 'd_local', 'ratio'),
    pearsonr(xfp, 'd_local', 'ratio (norm)'),
])


# ## Count 1/2 subcategory (number, gender) neutralizations

# In[10]:


# Tagged cells only, with the index levels turned back into columns.
ff = df.dropna(subset=['d_local']).reset_index()
# 'A' is the part of the cell key before SEP, 'P' the part after it
# (presumably agent and patient -- TODO confirm).
ff[['A', 'P']] = ff['Cell'].str.partition(SEP)[[0, 2]]
# show all argument values that occur
print(pd.concat([ff['A'], ff['P']]).unique())

# Boolean masks per argument: first person contains '1', second person
# contains '2' or one of 'di'/'pi', third person is whatever matches neither.
a_first, p_first = (ff[x].str.contains(r'1') for x in ('A', 'P'))
a_second, p_second = (ff[x].str.contains(r'2|[dp]i') for x in ('A', 'P'))
a_third, p_third = ~a_first & ~a_second, ~p_first & ~p_second

# treat inclusive cells as first person only
a_second &= ~a_first; p_second &= ~p_first
# after that, every argument belongs to exactly one person
assert (pd.concat([a_first, a_second, a_third], axis=1).sum(axis=1) == 1).all()
assert (pd.concat([p_first, p_second, p_third], axis=1).sum(axis=1) == 1).all()

# Row selections keyed as '<pattern>:<column>': the part after ':' names
# the argument column ('A' or 'P') the selected rows are grouped by below.
groups = {'1->X:P': a_first & p_third,
          '2->X:P': a_second & p_third,
          'X->1:A': a_third & p_first,
          'X->2:A': a_third & p_second,
          #
          '1->2:A': a_first & p_second,
          '2->1:A': a_second & p_first,
          '1->2:P': a_first & p_second,
          '2->1:P': a_second & p_first}

# For every selection: number of cells and of distinct forms per language,
# tag and argument value; the result is indexed by language, group key and
# argument value, with the tag moved back into a column.
lf = (pd.concat([ff[c].groupby(['Language', 'd_local', g.rpartition(':')[-1]])['Form']
                 .agg(['size', 'nunique'])
                 .assign(group=g)
                 .set_index('group', append=True)
                 .swaplevel()
                 .reset_index('d_local')
                 for g, c in groups.items()])
      .sort_index())
# the third index level holds 'A' or 'P' values depending on the group
lf.index.rename('X', level=2, inplace=True)

# surplus of cells over distinct forms, as a percentage of size - 1
lf['neut'] = lf['size'] - lf['nunique']
lf['ratio'] = 100 * lf['neut'] / (lf['size'] - 1)

lf.head(14)


# ## 1/2 subcategory neutralization ratios (1/2<->3 vs. 1/2<->1/2)

# In[11]:


# Mean of the subcategory ratios per language and tag.
rf = lf.pivot_table('ratio', ['Language', 'd_local'], aggfunc='mean')

# each ratio relative to the sum of the ratios of its language
rf['ratio (norm)'] = rf['ratio'] /  rf['ratio'].groupby(level='Language').sum()
# blank the raw ratio wherever the normalized one is undefined
rf.loc[rf['ratio (norm)'].isnull(), 'ratio'] = None

# long format (d_local as column) is kept for the correlation below,
# wide format (d_local values as column level) for display and plotting
rfp = rf.reset_index('d_local')
rf = rf.unstack()

rf


# In[12]:


# Stacked bars of the mean subcategory ratios; languages are ordered by the
# normalized ratio of the d_local == True column (stable sort).
rf.sort_values(by=('ratio (norm)', True), kind='mergesort')['ratio'].plot.bar(
    stacked=True, figsize=(15, 5), cmap=plt.cm.gray);


# In[13]:


# Stacked bars of the normalized subcategory ratios, languages ordered by
# the d_local == True column (stable sort).
rf['ratio (norm)'].sort_values(by=True, kind='mergesort').plot.bar(
    stacked=True, figsize=(15, 5), cmap=plt.cm.gray);


# ## Test for 1/2 subcategory neutralization differences

# In[14]:


# Correlate the tag with the raw and with the normalized ratio (one row each).
pd.DataFrame([
    pearsonr(rfp, 'd_local', 'ratio'),
    pearsonr(rfp, 'd_local', 'ratio (norm)'),
])


# ## Absence of non-person features in learned meanings

# In[15]:


# Count every signed feature ('+' or '-' followed by word characters)
# found in the meanings.
(cf['Meaning']
 .str.extractall(r'([+-]\w+)')[0]
 .value_counts()
 .to_frame('n'))


# In[16]:


# A sign followed by one or more non-digit characters up to a word boundary
# (person features are presumably the numeric ones -- TODO confirm).
NONPERSON = r'[+-]\D+\b'

# PersonOnly is True for rows whose meaning has no match for NONPERSON.
nf = cf.assign(PersonOnly=~cf['Meaning'].str.contains(NONPERSON))

nf.head(14)


# In[17]:


# A cell is person-only when all of its rows are person-only.
cnf = nf.groupby(['Language', 'Cell'], sort=False)['PersonOnly'].all()
cnf = cnf.to_frame('PersonOnly')

# Tag every cell by its key and put the tag in front as column 'd_local'.
_cell_tags = cnf.index.get_level_values('Cell').map(is_distinct_local)
cnf.insert(0, 'd_local', _cell_tags)

cnf.head(14)


# In[18]:


# Share of person-only cells per language and tag (untagged cells dropped):
# the mean of the boolean PersonOnly column, renamed to 'ratio'.
xnf = (cnf.dropna(subset=['d_local'])
       .groupby(['Language', 'd_local']).mean()
       .rename(columns={'PersonOnly': 'ratio'}))

# each ratio relative to the sum of the ratios of its language
xnf['ratio (norm)'] = xnf['ratio'] / xnf['ratio'].groupby(level='Language').sum()
# blank the raw ratio wherever the normalized one is undefined
xnf.loc[xnf['ratio (norm)'].isnull(), 'ratio'] = None

# long format (d_local as column) is kept for the correlation below,
# wide format (d_local values as column level) for display and plotting
xnfp = xnf.reset_index('d_local')
xnf = xnf.unstack()

xnf


# In[19]:


# Stacked bars of the normalized person-only ratios, languages ordered by
# the d_local == True column (stable sort).
xnf['ratio (norm)'].sort_values(by=True, kind='mergesort').plot.bar(
    stacked=True, figsize=(15, 5), cmap=plt.cm.gray);


# In[20]:


# Correlate the tag with the raw and with the normalized ratio (one row each).
pd.DataFrame([
    pearsonr(xnfp, 'd_local', 'ratio'),
    pearsonr(xnfp, 'd_local', 'ratio (norm)'),
])