#!/usr/bin/env python
# coding: utf-8

# In[1]:

# Imports

import os
import string
import math
import re

from collections import Counter
from pprint import pprint

import html

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

from cltk.corpus.latin import latinlibrary
from cltk.tokenize.word import WordTokenizer
from cltk.stem.latin.j_v import JVReplacer


# In[2]:

# Setup CLTK tools

word_tokenizer = WordTokenizer('latin')
replacer = JVReplacer()


# In[3]:

# Setup files

files = latinlibrary.fileids()
print("There are %d files in the Latin Library corpus." % len(files))


# In[4]:

# Typical setup
files = [file for file in files]

# Filter for classical texts
#classical = []
#remove = ["The Bible","Ius Romanum","Papal Bulls","Medieval Latin","Christian Latin","Christina Latin","Neo-Latin","The Miscellany","Contemporary Latin"]

#for file in files:
#    raw = latinlibrary.raw(file)
#    if not any(x in raw for x in remove):
#        classical.append(file)

#files = classical
#print("There are %d files in the Latin Library Classical subcorpus." % len(files))


# In[5]:

# Preprocess texts

def preprocess(text):

    text = html.unescape(text)  # Handle html entities
    text = re.sub(r'&nbsp;?', ' ', text)  # &nbsp; stripped incorrectly in corpus?
    text = re.sub(r'\x00', ' ', text)  # Another space problem?

    text = text.lower()
    text = replacer.replace(text)  # Normalize u/v & i/j

    punctuation = "\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~.?!«»"
    translator = str.maketrans({key: " " for key in punctuation})
    text = text.translate(translator)

    translator = str.maketrans({key: " " for key in '0123456789'})
    text = text.translate(translator)

    remove_list = [r'\bthe latin library\b',
                   r'\bthe classics page\b',
                   r'\bneo-latin\b',
                   r'\bmedieval latin\b',
                   r'\bchristian latin\b',
                   r'\bchristina latin\b',
                   r'\bpapal bulls\b',
                   r'\bthe miscellany\b',
                   ]

    for pattern in remove_list:
        text = re.sub(pattern, '', text)

    text = re.sub('[ ]+', ' ', text)  # Remove double spaces
    text = re.sub(r'\s+\n+\s+', '\n', text)  # Remove double lines and trim spaces around new lines

    return text


# In[6]:

# Make list of texts

raw_files = []

for file in files:
    raw = latinlibrary.raw(file)
    raw = preprocess(raw)
    # Keep only files longer than 1000 characters; trim 50 tokens from each end
    if len(raw) >= 1000:
        raw_tokens = raw.split()
        raw = " ".join(raw_tokens[50:-50])
        raw_files.append(raw)


# In[7]:

tokens = [file.split() for file in raw_files]
tokens = [val for sublist in tokens for val in sublist]

print(len(tokens))
print(len(set(tokens)))

rank = Counter(tokens)
print(rank.most_common(25))

hapax = len([x for x in tokens if rank[x] == 1])
print(hapax)


# ### Following [Alajmi 2012]

# In[8]:

# See also Zou et al. 2006

# Make document-term matrix and vocabulary

vectorizer = CountVectorizer(input='content', min_df=2)
dtm = vectorizer.fit_transform(raw_files)
dtm = dtm.toarray()

vocab = vectorizer.get_feature_names()
vocab = np.array(vocab)


# In[9]:

M = len(vocab)
N = len(raw_files)


# In[10]:

# Make array of probabilities per book

raw_lengths = [len(file.split()) for file in raw_files]
l = np.array(raw_lengths)
ll = l.reshape(len(l), 1)

probs = dtm / ll
P = probs
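# In[ ]:

# A minimal sanity-check sketch (illustrative only, not part of the original
# pipeline): the same row-wise normalization applied to a tiny hand-made
# document-term matrix, so the broadcasting is easy to verify by eye. The
# toy_* names exist only for this illustration. Here each document length is
# taken as the row sum; in the pipeline above it is the raw token count, so
# rows there need not sum exactly to 1.

toy_dtm = np.array([[2, 1, 0],   # toy doc 1: 3 tokens
                    [1, 0, 3]])  # toy doc 2: 4 tokens
toy_lengths = toy_dtm.sum(axis=1).reshape(-1, 1)
toy_P = toy_dtm / toy_lengths
print(toy_P)
# [[0.66666667 0.33333333 0.        ]
#  [0.25       0.         0.75      ]]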
# In[11]:

# Calculate mean probability,
# i.e. sum of probabilities for each word / number of documents

probsum = np.ravel(probs.sum(axis=0))
MP = probsum / N


# In[12]:

# Make array of 'bar' probability,
# i.e. word counts over the length of the whole corpus

length = sum(raw_lengths)
barprobs = dtm / length
bP = barprobs


# In[13]:

variance = (P - bP) ** 2
varsum = np.ravel(variance.sum(axis=0))
VP = varsum / N


# In[14]:

cutoff = 100


# In[15]:

# Return top counts

freq = np.ravel(dtm.sum(axis=0))
wordfreq = list(zip(vocab, freq))
wordfreq.sort(key=lambda x: x[1], reverse=True)
wf = [item[0] for item in wordfreq]
wf = wf[:cutoff]
print(wf)


# In[16]:

# Return top mean prob

test = list(zip(vocab, MP))
test.sort(key=lambda x: x[1], reverse=True)
mp = [item[0] for item in test]
mp = mp[:cutoff]
print(mp)


# In[17]:

# Return top variance prob

test = list(zip(vocab, VP))
test.sort(key=lambda x: x[1], reverse=True)
vp = [item[0] for item in test]
vp = vp[:cutoff]
print(vp)


# In[18]:

with np.errstate(divide='ignore', invalid='ignore'):
    logprobs = np.where(probs != 0, np.log10(1/probs), 0)
    ent = probs * logprobs


# In[19]:

ents = np.ravel(ent.sum(axis=0))
entrank = list(zip(vocab, ents))
entrank.sort(key=lambda x: x[1], reverse=True)
e = [item[0] for item in entrank]
e = e[:cutoff]
print(e)


# In[20]:

def borda_sort(lists):
    ### From http://stackoverflow.com/a/30259368/1816347 ###
    scores = {}
    for l in lists:
        for idx, elem in enumerate(reversed(l)):
            if elem not in scores:
                scores[elem] = 0
            scores[elem] += idx
    return sorted(scores.keys(), key=lambda elem: scores[elem], reverse=True)


# In[21]:

lists = [wf, mp, vp, e]
borda = borda_sort(lists)
print(borda[:100])


# ### Other Latin stopword lists

# In[22]:

tesserae = ['qui', 'quis', 'et', 'sum', 'in', 'is', 'non', 'hic', 'ego', 'ut']


# In[23]:

# Cf. http://www.perseus.tufts.edu/hopper/stopwords
# Same as the list available via:
# from cltk.stop.latin.stops import STOPS_LIST

perseus = ['ab', 'ac', 'ad', 'adhic', 'aliqui', 'aliquis', 'an', 'ante', 'apud', 'at', 'atque', 'aut', 'autem', 'cum', 'cur', 'de', 'deinde', 'dum', 'ego', 'enim', 'ergo', 'es', 'est', 'et', 'etiam', 'etsi', 'ex', 'fio', 'haud', 'hic', 'iam', 'idem', 'igitur', 'ille', 'in', 'infra', 'inter', 'interim', 'ipse', 'is', 'ita', 'magis', 'modo', 'mox', 'nam', 'ne', 'nec', 'necque', 'neque', 'nisi', 'non', 'nos', 'o', 'ob', 'per', 'possum', 'post', 'pro', 'quae', 'quam', 'quare', 'qui', 'quia', 'quicumque', 'quidem', 'quilibet', 'quis', 'quisnam', 'quisquam', 'quisque', 'quisquis', 'quo', 'quoniam', 'sed', 'si', 'sic', 'sive', 'sub', 'sui', 'sum', 'super', 'suus', 'tam', 'tamen', 'trans', 'tu', 'tum', 'ubi', 'uel', 'uero', 'unus', 'ut']
perseus = [replacer.replace(word) for word in perseus]


# In[24]:

pprint(list(set(perseus) - set(borda)))
pprint(list(set(borda) - set(perseus)))


# In[25]:

pprint(list(set(tesserae) - set(borda)))
pprint(list(set(borda) - set(tesserae)))


# ### References
#
# - Alajmi, A., E. M. Saad, and R. R. Darwish. 2012. "Toward an Arabic Stop-Words List Generation." *International Journal of Computer Applications* 48(8): 8-13.
# - Zou, F., F. L. Wang, X. Deng, S. Han, and L. S. Wang. 2006. "Automatic Construction of Chinese Stop Word List." In *Proceedings of the 5th WSEAS International Conference on Applied Computer Science*, 1010-1015.
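# In[ ]:

# A follow-up sketch (illustrative only, not part of the original analysis):
# quick overlap counts between the top 100 words of the fused (borda) ranking
# and the two reference stoplists, summarizing the set differences printed
# above in a single line per list.

for name, stoplist in [('perseus', perseus), ('tesserae', tesserae)]:
    shared = len(set(borda[:100]) & set(stoplist))
    print("%s: %d of %d words also appear in the top-100 borda list" % (name, shared, len(set(stoplist))))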