#!/usr/bin/env python
# coding: utf-8

# In[1]:

# Imports

import os
import string
import math
import re

from collections import Counter
from pprint import pprint

import html

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

from cltk.corpus.latin import latinlibrary
from cltk.tokenize.word import WordTokenizer
from cltk.stem.latin.j_v import JVReplacer


# In[2]:

# Setup CLTK tools

word_tokenizer = WordTokenizer('latin')
replacer = JVReplacer()


# In[3]:

# Setup files

files = latinlibrary.fileids()
print("There are %d files in the Latin Library corpus." % len(files))


# In[4]:

# Typical setup
files = [file for file in files]

# Filter for classical texts
#classical = []
#remove = ["The Bible","Ius Romanum","Papal Bulls","Medieval Latin","Christian Latin","Christina Latin","Neo-Latin","The Miscellany","Contemporary Latin"]

#for file in files:
#    raw = latinlibrary.raw(file)
#    if not any(x in raw for x in remove):
#        classical.append(file)

#files = classical
#print("There are %d files in the Latin Library Classical subcorpus." % len(files))


# In[5]:

# Preprocess texts

def preprocess(text):

    text = html.unescape(text)  # Handle html entities
    text = re.sub(r'&nbsp;?', ' ', text)  # &nbsp; stripped incorrectly in corpus?
    text = re.sub(r'\x00', ' ', text)  # Another space problem?

    text = text.lower()
    text = replacer.replace(text)  # Normalize u/v & i/j

    punctuation = "\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~.?!«»"
    translator = str.maketrans({key: " " for key in punctuation})
    text = text.translate(translator)

    translator = str.maketrans({key: " " for key in '0123456789'})
    text = text.translate(translator)

    remove_list = [r'\bthe latin library\b',
                   r'\bthe classics page\b',
                   r'\bneo-latin\b',
                   r'\bmedieval latin\b',
                   r'\bchristian latin\b',
                   r'\bchristina latin\b',
                   r'\bpapal bulls\b',
                   r'\bthe miscellany\b',
                   ]

    for pattern in remove_list:
        text = re.sub(pattern, '', text)

    text = re.sub('[ ]+', ' ', text)  # Remove double spaces
    text = re.sub(r'\s+\n+\s+', '\n', text)  # Remove double lines and trim spaces around new lines

    return text


# In[6]:

# Make list of texts

raw_files = []

for file in files:
    raw = latinlibrary.raw(file)
    raw = preprocess(raw)
    # Keep only files longer than 1000 characters; trim 50 tokens from each end
    if len(raw) >= 1000:
        raw_tokens = raw.split()
        raw = " ".join(raw_tokens[50:-50])
        raw_files.append(raw)


# In[7]:

tokens = [file.split() for file in raw_files]
tokens = [val for sublist in tokens for val in sublist]

print(len(tokens))
print(len(set(tokens)))

rank = Counter(tokens)
print(rank.most_common(25))

hapax = len([x for x in tokens if rank[x] == 1])
print(hapax)


# ### Following [Alajmi 2012]

# In[8]:

# See also Zou et al. 2006

# Make document-term matrix and vocabulary

vectorizer = CountVectorizer(input='content', min_df=2)
dtm = vectorizer.fit_transform(raw_files)
dtm = dtm.toarray()

vocab = vectorizer.get_feature_names()
vocab = np.array(vocab)


# In[9]:

M = len(vocab)
N = len(raw_files)


# In[10]:

# Make array of probabilities per book

raw_lengths = [len(file.split()) for file in raw_files]
l = np.array(raw_lengths)
ll = l.reshape(len(l), 1)

probs = dtm / ll
P = probs
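# In[ ]:

# A minimal sanity-check sketch (illustrative only, not part of the original
# pipeline): the same row-wise normalization applied to a tiny hand-made
# document-term matrix, so the broadcasting is easy to verify by eye. The
# toy_* names exist only for this illustration. Here each document length is
# taken as the row sum; in the pipeline above it is the raw token count, so
# rows there need not sum exactly to 1.

toy_dtm = np.array([[2, 1, 0],   # toy doc 1: 3 tokens
                    [1, 0, 3]])  # toy doc 2: 4 tokens
toy_lengths = toy_dtm.sum(axis=1).reshape(-1, 1)
toy_P = toy_dtm / toy_lengths
print(toy_P)
# [[0.66666667 0.33333333 0.        ]
#  [0.25       0.         0.75      ]]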
# In[11]:

# Calculate mean probability,
# i.e. sum of probabilities for each word / number of documents

probsum = np.ravel(probs.sum(axis=0))
MP = probsum / N


# In[12]:

# Make array of 'bar' probability,
# i.e. word counts over the length of the whole corpus

length = sum(raw_lengths)
barprobs = dtm / length
bP = barprobs


# In[13]:

variance = (P - bP) ** 2
varsum = np.ravel(variance.sum(axis=0))
VP = varsum / N


# In[14]:

cutoff = 100


# In[15]:

# Return top counts

freq = np.ravel(dtm.sum(axis=0))
wordfreq = list(zip(vocab, freq))
wordfreq.sort(key=lambda x: x[1], reverse=True)
wf = [item[0] for item in wordfreq]
wf = wf[:cutoff]
print(wf)


# In[16]:

# Return top mean prob

test = list(zip(vocab, MP))
test.sort(key=lambda x: x[1], reverse=True)
mp = [item[0] for item in test]
mp = mp[:cutoff]
print(mp)


# In[17]:

# Return top variance prob

test = list(zip(vocab, VP))
test.sort(key=lambda x: x[1], reverse=True)
vp = [item[0] for item in test]
vp = vp[:cutoff]
print(vp)


# In[18]:

with np.errstate(divide='ignore', invalid='ignore'):
    logprobs = np.where(probs != 0, np.log10(1/probs), 0)
    ent = probs * logprobs


# In[19]:

ents = np.ravel(ent.sum(axis=0))
entrank = list(zip(vocab, ents))
entrank.sort(key=lambda x: x[1], reverse=True)
e = [item[0] for item in entrank]
e = e[:cutoff]
print(e)


# In[20]:

def borda_sort(lists):
    ### From http://stackoverflow.com/a/30259368/1816347 ###
    scores = {}
    for l in lists:
        for idx, elem in enumerate(reversed(l)):
            if elem not in scores:
                scores[elem] = 0
            scores[elem] += idx
    return sorted(scores.keys(), key=lambda elem: scores[elem], reverse=True)


# In[21]:

lists = [wf, mp, vp, e]
borda = borda_sort(lists)
print(borda[:100])


# ### Other Latin stopword lists

# In[22]:

tesserae = ['qui', 'quis', 'et', 'sum', 'in', 'is', 'non', 'hic', 'ego', 'ut']


# In[23]:

# Cf. http://www.perseus.tufts.edu/hopper/stopwords
# Same as the list available via:
# from cltk.stop.latin.stops import STOPS_LIST

perseus = ['ab', 'ac', 'ad', 'adhic', 'aliqui', 'aliquis', 'an', 'ante', 'apud', 'at', 'atque', 'aut', 'autem', 'cum', 'cur', 'de', 'deinde', 'dum', 'ego', 'enim', 'ergo', 'es', 'est', 'et', 'etiam', 'etsi', 'ex', 'fio', 'haud', 'hic', 'iam', 'idem', 'igitur', 'ille', 'in', 'infra', 'inter', 'interim', 'ipse', 'is', 'ita', 'magis', 'modo', 'mox', 'nam', 'ne', 'nec', 'necque', 'neque', 'nisi', 'non', 'nos', 'o', 'ob', 'per', 'possum', 'post', 'pro', 'quae', 'quam', 'quare', 'qui', 'quia', 'quicumque', 'quidem', 'quilibet', 'quis', 'quisnam', 'quisquam', 'quisque', 'quisquis', 'quo', 'quoniam', 'sed', 'si', 'sic', 'sive', 'sub', 'sui', 'sum', 'super', 'suus', 'tam', 'tamen', 'trans', 'tu', 'tum', 'ubi', 'uel', 'uero', 'unus', 'ut']
perseus = [replacer.replace(word) for word in perseus]


# In[24]:

pprint(list(set(perseus) - set(borda)))
pprint(list(set(borda) - set(perseus)))


# In[25]:

pprint(list(set(tesserae) - set(borda)))
pprint(list(set(borda) - set(tesserae)))


# ### References
#
# - Alajmi, A., E. M. Saad, and R. R. Darwish. 2012. "Toward an Arabic Stop-Words List Generation." *International Journal of Computer Applications* 48(8): 8-13.
# - Zou, F., F. L. Wang, X. Deng, S. Han, and L. S. Wang. 2006. "Automatic Construction of Chinese Stop Word List." In *Proceedings of the 5th WSEAS International Conference on Applied Computer Science*, 1010-1015.
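# In[ ]:

# A follow-up sketch (illustrative only, not part of the original analysis):
# quick overlap counts between the top 100 words of the fused (borda) ranking
# and the two reference stoplists, summarizing the set differences printed
# above in a single line per list.

for name, stoplist in [('perseus', perseus), ('tesserae', tesserae)]:
    shared = len(set(borda[:100]) & set(stoplist))
    print("%s: %d of %d words also appear in the top-100 borda list" % (name, shared, len(set(stoplist))))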