import re
import os
import nltk

raw = open('gadsby_full_lower.txt', 'r').read()
tokens = nltk.word_tokenize(raw)
text = nltk.Text(tokens)

# summary statistics
print ' '
print "Characters: {}".format(len(raw))
print "Tokens: {}".format(len(tokens))
print "Unique tokens: {}".format(len(set(tokens)))
print "Lexical diversity: {:.3f}".format(len(set(tokens)) * 1.0 / len(tokens))

# create frequency distribution dataframe
fdist = nltk.FreqDist(text)
import pandas as pd
df = pd.DataFrame({'word': fdist.keys(), 'freq': fdist.values()})
df = df[~df.word.str.contains('[^A-Za-z]')]  # drop tokens with non-alphabetic characters
print ' '
print df.head()

# sanity check: Gadsby is a lipogram, so words containing 'e' should be (nearly) absent
print df[df.word.str.contains('e')]

if not os.path.isfile('brown_df.pickle'):
    print "Processing Brown corpus from NLTK"
    from nltk.corpus import brown
    categories = []
    words = []
    frequencies = []
    for category in brown.categories():
        wordlist = brown.words(categories=category)
        freqs = nltk.FreqDist([w.lower() for w in wordlist])
        for key in freqs.keys():
            categories.append(category)
            words.append(key)
            frequencies.append(freqs[key])
    brown_df = pd.DataFrame({'word': words, 'freq': frequencies, 'category': categories})
    brown_df['nonalpha'] = brown_df.word.str.contains('[^A-Za-z]')
    brown_df.to_pickle('brown_df.pickle')
else:
    print "Reading brown_df.pickle"
    brown_df = pd.read_pickle('brown_df.pickle')

if not os.path.isfile('brown_non_e.pickle'):
    print 'Creating dataframe.'
    # Brown words that are fully alphabetic and contain no 'e', summed across categories
    brown_words = brown_df[brown_df.nonalpha == False]
    brown_words = brown_words[~brown_words.word.str.contains('e')]
    brown_words = brown_words.groupby('word').sum().reset_index(drop=False)
    brown_words['length'] = brown_words.word.str.len()
    # total weighted length, plus the frequency-weighted median word length:
    # walk lengths in ascending order, spending each word's frequency,
    # until half the total frequency is passed
    weighted_length = (brown_words.freq * brown_words.length).sum()
    total_freq = brown_words.freq.sum()
    median_counter = total_freq / 2
    wt_median_len = None
    for idx, row in brown_words.sort('length').iterrows():
        median_counter -= row['freq']
        if median_counter < 0:
            wt_median_len = row['length']
            break
    brown_words = brown_words[['word', 'freq', 'length']]
    brown_words.sort('freq', ascending=False, inplace=True)
    brown_words.to_pickle('brown_non_e.pickle')
else:
    print 'Reading pickle.'
    brown_words = pd.read_pickle('brown_non_e.pickle')

from numpy import log

def loglike(n1, t1, n2, t2):
    """Calculates the Dunning log likelihood of an observation of
    frequency n1 in a corpus of size t1, compared to a frequency n2
    in a corpus of size t2. If the result is positive, the word is
    more likely to occur in corpus 1; if negative, in corpus 2."""
    e1 = t1 * 1.0 * (n1 + n2) / (t1 + t2)  # expected frequencies
    e2 = t2 * 1.0 * (n1 + n2) / (t1 + t2)
    LL = 0.0
    if n1 > 0:  # treat 0 * log(0) as 0 rather than nan
        LL += n1 * log(n1 / e1)
    if n2 > 0:
        LL += n2 * log(n2 / e2)
    LL *= 2
    if n2 * 1.0 / t2 > n1 * 1.0 / t1:
        LL = -LL
    return LL

t1 = df.freq.sum()
t2 = brown_words.freq.sum()
df['log_likelihood'] = 0.0
for i in range(len(df)):
    word = df.word.iloc[i]
    n1 = df.freq.iloc[i]
    fnd = brown_words[brown_words.word == word]
    if len(fnd) > 0:
        n2 = fnd.freq.iloc[0]
    else:
        n2 = 0
    df.loc[df.index[i], 'log_likelihood'] = loglike(n1, t1, n2, t2)

print df.sort('log_likelihood', ascending=False).head(20)
print df.sort('log_likelihood', ascending=True).head(20)
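# A quick sanity check of loglike() on made-up counts (an illustrative
# sketch, not part of the original analysis). A word seen 50 times in a
# 10,000-token corpus but only 5 times in a 100,000-token corpus should
# score strongly positive (overrepresented in corpus 1), the mirror-image
# case strongly negative, and equal relative frequencies roughly zero.
print loglike(50, 10000, 5, 100000)    # large positive: favours corpus 1
print loglike(5, 100000, 50, 10000)    # large negative: favours corpus 2
print loglike(10, 10000, 100, 100000)  # 0.0: same relative frequency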
import time
import numpy as np
from nltk.corpus import brown

br_trig = nltk.trigrams(brown.words())

if not os.path.isfile('gadsby_analysis.pickle'):
    start = time.time()
    print "Building dataframe."
    # expected Gadsby frequency of each word, scaled from its Brown frequency
    freq_g = df.freq.sum()
    freq_b = brown_words.freq.sum()
    df['brown_freq_normalized'] = 0.0
    for i in range(len(df)):
        word = df.word.iloc[i]
        try:
            brown_freq = brown_words[brown_words.word == word].freq.iloc[0]
        except IndexError:
            brown_freq = 0
        df.loc[df.index[i], 'brown_freq_normalized'] = brown_freq * 1.0 * freq_g / freq_b
    df['diff_absolute'] = df.freq - df.brown_freq_normalized
    df['diff_relative'] = df.freq * 1.0 / df.brown_freq_normalized

    # record the lower-cased neighbours of every fully-alphabetic trigram in Brown
    wprev = []
    word = []
    wnext = []
    for item in br_trig:
        a = item[0].lower()
        b = item[1].lower()
        c = item[2].lower()
        if re.search('[a-z]', a) and re.search('[a-z]', b) and re.search('[a-z]', c):
            wprev.append(a)
            word.append(b)
            wnext.append(c)
    tri = pd.DataFrame({'wprev': wprev, 'word': word, 'wnext': wnext})

    def calc_pcte(row):
        """Percent of a word's occurrences in Brown where a neighbour contains 'e'."""
        dftemp = tri[tri.word == row.word]
        total = len(dftemp)
        dftempe = dftemp[(dftemp.wprev.str.contains('e')) | (dftemp.wnext.str.contains('e'))]
        try:
            return len(dftempe) * 100.0 / total
        except ZeroDivisionError:
            return np.nan

    def calc_pctthe(row):
        """Percent of a word's occurrences in Brown where a neighbour is 'the'."""
        dftemp = tri[tri.word == row.word]
        total = len(dftemp)
        dftempthe = dftemp[(dftemp.wprev == 'the') | (dftemp.wnext == 'the')]
        try:
            return len(dftempthe) * 100.0 / total
        except ZeroDivisionError:
            return np.nan

    df['pct_e'] = df.apply(calc_pcte, axis=1)
    df['pct_the'] = df.apply(calc_pctthe, axis=1)
    df.to_pickle('gadsby_analysis.pickle')
    df.to_csv('gadsby_analysis.csv')
    print "Done. {} minutes elapsed.".format(round((time.time() - start) / 60, 1))
else:
    print "Reading pickle."
    df = pd.read_pickle('gadsby_analysis.pickle')

from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import seaborn

plt.figure(figsize=(9, 6))
dftemp = df[['word', 'log_likelihood', 'pct_e', 'freq']].dropna()
#dftemp = dftemp[(dftemp.log_likelihood > -50) & (dftemp.log_likelihood < 50)]
dftemp = dftemp[dftemp.freq > 10]
plt.scatter(dftemp.log_likelihood, dftemp.pct_e, s=dftemp.freq * 1.5,
            marker='o', color='blue', alpha=0.15, label='data')
plt.title(' ')
#plt.xlim(-10, 10)
plt.xlabel('Log-likelihood that word is overrepresented in Gadsby')
plt.ylabel("Percent probability of neighbouring 'e' in Brown corpus")
plt.show()

df

from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np

plt.figure(figsize=(8, 6))
dftemp = df[['word', 'log_likelihood', 'pct_the', 'freq']].dropna()
#dftemp = dftemp[(dftemp.log_likelihood < -100) | (dftemp.log_likelihood > 100)]
plt.scatter(dftemp.log_likelihood, dftemp.pct_the, s=dftemp.freq,
            marker='o', color='blue', alpha=0.4, label='data')
plt.title(' ')
plt.xlabel('Log-likelihood that word is overrepresented in Gadsby')
plt.ylabel("Percent probability of neighbouring 'the' in Brown corpus")
plt.show()
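# Optional follow-up (a sketch, not in the original analysis): combine the
# columns built above into one view. Words with a high log-likelihood and a
# high pct_e are the ones Wright plausibly leaned on, since their usual
# neighbours in ordinary English contain an 'e'.
cols = ['word', 'freq', 'brown_freq_normalized', 'log_likelihood', 'pct_e', 'pct_the']
print df.sort('log_likelihood', ascending=False)[cols].head(10)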