import re
import os
import nltk

raw = open('gadsby_full_lower.txt', 'r').read()
tokens = nltk.word_tokenize(raw)
text = nltk.Text(tokens)

# summary statistics
print ' '
print "Characters: {}".format(len(raw))
print "Tokens: {}".format(len(tokens))
print "Unique tokens: {}".format(len(set(tokens)))
print "Lexical diversity: {:.3f}".format(len(set(tokens)) * 1.0 / len(tokens))

# create frequency distribution dataframe
fdist = nltk.FreqDist(text)
import pandas as pd
df = pd.DataFrame({'word': fdist.keys(), 'freq': fdist.values()})
df = df[~df.word.str.contains('[^A-Za-z]')]  # drop tokens with non-alphabetic characters
print ' '
print df.head()

# sanity check: Gadsby is a lipogram, so words containing 'e' should be (nearly) absent
print df[df.word.str.contains('e')]

if not os.path.isfile('brown_df.pickle'):
    print "Processing Brown corpus from NLTK"
    from nltk.corpus import brown
    categories = []
    words = []
    frequencies = []
    for category in brown.categories():
        wordlist = brown.words(categories=category)
        freqs = nltk.FreqDist([w.lower() for w in wordlist])
        for key in freqs.keys():
            categories.append(category)
            words.append(key)
            frequencies.append(freqs[key])
    brown_df = pd.DataFrame({'word': words, 'freq': frequencies, 'category': categories})
    brown_df['nonalpha'] = brown_df.word.str.contains('[^A-Za-z]')
    brown_df.to_pickle('brown_df.pickle')
else:
    print "Reading brown_df.pickle"
    brown_df = pd.read_pickle('brown_df.pickle')

if not os.path.isfile('brown_non_e.pickle'):
    print 'Creating dataframe.'
    # Brown words that are fully alphabetic and contain no 'e', summed across categories
    brown_words = brown_df[brown_df.nonalpha == False]
    brown_words = brown_words[~brown_words.word.str.contains('e')]
    brown_words = brown_words.groupby('word').sum().reset_index(drop=False)
    brown_words['length'] = brown_words.word.str.len()
    # total weighted length, plus the frequency-weighted median word length:
    # walk lengths in ascending order, spending each word's frequency,
    # until half the total frequency is passed
    weighted_length = (brown_words.freq * brown_words.length).sum()
    total_freq = brown_words.freq.sum()
    median_counter = total_freq / 2
    wt_median_len = None
    for idx, row in brown_words.sort('length').iterrows():
        median_counter -= row['freq']
        if median_counter < 0:
            wt_median_len = row['length']
            break
    brown_words = brown_words[['word', 'freq', 'length']]
    brown_words.sort('freq', ascending=False, inplace=True)
    brown_words.to_pickle('brown_non_e.pickle')
else:
    print 'Reading pickle.'
    brown_words = pd.read_pickle('brown_non_e.pickle')

from numpy import log

def loglike(n1, t1, n2, t2):
    """Calculates the Dunning log likelihood of an observation of
    frequency n1 in a corpus of size t1, compared to a frequency n2
    in a corpus of size t2. If the result is positive, the word is
    more likely to occur in corpus 1; if negative, in corpus 2."""
    e1 = t1 * 1.0 * (n1 + n2) / (t1 + t2)  # expected frequencies
    e2 = t2 * 1.0 * (n1 + n2) / (t1 + t2)
    LL = 0.0
    if n1 > 0:  # treat 0 * log(0) as 0 rather than nan
        LL += n1 * log(n1 / e1)
    if n2 > 0:
        LL += n2 * log(n2 / e2)
    LL *= 2
    if n2 * 1.0 / t2 > n1 * 1.0 / t1:
        LL = -LL
    return LL

t1 = df.freq.sum()
t2 = brown_words.freq.sum()
df['log_likelihood'] = 0.0
for i in range(len(df)):
    word = df.word.iloc[i]
    n1 = df.freq.iloc[i]
    fnd = brown_words[brown_words.word == word]
    if len(fnd) > 0:
        n2 = fnd.freq.iloc[0]
    else:
        n2 = 0
    df.loc[df.index[i], 'log_likelihood'] = loglike(n1, t1, n2, t2)

print df.sort('log_likelihood', ascending=False).head(20)
print df.sort('log_likelihood', ascending=True).head(20)
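# A quick sanity check of loglike() on made-up counts (an illustrative
# sketch, not part of the original analysis). A word seen 50 times in a
# 10,000-token corpus but only 5 times in a 100,000-token corpus should
# score strongly positive (overrepresented in corpus 1), the mirror-image
# case strongly negative, and equal relative frequencies roughly zero.
print loglike(50, 10000, 5, 100000)    # large positive: favours corpus 1
print loglike(5, 100000, 50, 10000)    # large negative: favours corpus 2
print loglike(10, 10000, 100, 100000)  # 0.0: same relative frequency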
import time
import numpy as np
from nltk.corpus import brown

br_trig = nltk.trigrams(brown.words())

if not os.path.isfile('gadsby_analysis.pickle'):
    start = time.time()
    print "Building dataframe."
    # expected Gadsby frequency of each word, scaled from its Brown frequency
    freq_g = df.freq.sum()
    freq_b = brown_words.freq.sum()
    df['brown_freq_normalized'] = 0.0
    for i in range(len(df)):
        word = df.word.iloc[i]
        try:
            brown_freq = brown_words[brown_words.word == word].freq.iloc[0]
        except IndexError:
            brown_freq = 0
        df.loc[df.index[i], 'brown_freq_normalized'] = brown_freq * 1.0 * freq_g / freq_b
    df['diff_absolute'] = df.freq - df.brown_freq_normalized
    df['diff_relative'] = df.freq * 1.0 / df.brown_freq_normalized

    # record the lower-cased neighbours of every fully-alphabetic trigram in Brown
    wprev = []
    word = []
    wnext = []
    for item in br_trig:
        a = item[0].lower()
        b = item[1].lower()
        c = item[2].lower()
        if re.search('[a-z]', a) and re.search('[a-z]', b) and re.search('[a-z]', c):
            wprev.append(a)
            word.append(b)
            wnext.append(c)
    tri = pd.DataFrame({'wprev': wprev, 'word': word, 'wnext': wnext})

    def calc_pcte(row):
        """Percent of a word's occurrences in Brown where a neighbour contains 'e'."""
        dftemp = tri[tri.word == row.word]
        total = len(dftemp)
        dftempe = dftemp[(dftemp.wprev.str.contains('e')) | (dftemp.wnext.str.contains('e'))]
        try:
            return len(dftempe) * 100.0 / total
        except ZeroDivisionError:
            return np.nan

    def calc_pctthe(row):
        """Percent of a word's occurrences in Brown where a neighbour is 'the'."""
        dftemp = tri[tri.word == row.word]
        total = len(dftemp)
        dftempthe = dftemp[(dftemp.wprev == 'the') | (dftemp.wnext == 'the')]
        try:
            return len(dftempthe) * 100.0 / total
        except ZeroDivisionError:
            return np.nan

    df['pct_e'] = df.apply(calc_pcte, axis=1)
    df['pct_the'] = df.apply(calc_pctthe, axis=1)
    df.to_pickle('gadsby_analysis.pickle')
    df.to_csv('gadsby_analysis.csv')
    print "Done. {} minutes elapsed.".format(round((time.time() - start) / 60, 1))
else:
    print "Reading pickle."
    df = pd.read_pickle('gadsby_analysis.pickle')

from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import seaborn

plt.figure(figsize=(9, 6))
dftemp = df[['word', 'log_likelihood', 'pct_e', 'freq']].dropna()
#dftemp = dftemp[(dftemp.log_likelihood > -50) & (dftemp.log_likelihood < 50)]
dftemp = dftemp[dftemp.freq > 10]
plt.scatter(dftemp.log_likelihood, dftemp.pct_e, s=dftemp.freq * 1.5,
            marker='o', color='blue', alpha=0.15, label='data')
plt.title(' ')
#plt.xlim(-10, 10)
plt.xlabel('Log-likelihood that word is overrepresented in Gadsby')
plt.ylabel("Percent probability of neighbouring 'e' in Brown corpus")
plt.show()

df

from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np

plt.figure(figsize=(8, 6))
dftemp = df[['word', 'log_likelihood', 'pct_the', 'freq']].dropna()
#dftemp = dftemp[(dftemp.log_likelihood < -100) | (dftemp.log_likelihood > 100)]
plt.scatter(dftemp.log_likelihood, dftemp.pct_the, s=dftemp.freq,
            marker='o', color='blue', alpha=0.4, label='data')
plt.title(' ')
plt.xlabel('Log-likelihood that word is overrepresented in Gadsby')
plt.ylabel("Percent probability of neighbouring 'the' in Brown corpus")
plt.show()
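# Optional follow-up (a sketch, not in the original analysis): combine the
# columns built above into one view. Words with a high log-likelihood and a
# high pct_e are the ones Wright plausibly leaned on, since their usual
# neighbours in ordinary English contain an 'e'.
cols = ['word', 'freq', 'brown_freq_normalized', 'log_likelihood', 'pct_e', 'pct_the']
print df.sort('log_likelihood', ascending=False)[cols].head(10)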