Load Gadsby and create dataframe of frequency distribution

In [1]:
import re
import os
import nltk
raw = open('gadsby_full_lower.txt', 'r').read()
tokens = nltk.word_tokenize(raw)
text = nltk.Text(tokens)

# summary statistics
print ' '
print "Characters: {}".format(len(raw))
print "Tokens: {}".format(len(tokens))
print "Unique tokens: {}".format(len(set(tokens)))
print "Lexical diversity: {:.3f}".format(len(set(tokens))*1.0/len(tokens))

# create frequency distribution dataframe
fdist = nltk.FreqDist(text)
import pandas as pd
df = pd.DataFrame({'word': fdist.keys(), 'freq': fdist.values()})
df = df[~df.word.str.contains('[^A-Za-z]')] # remove tokens with non-alphabetic characters
print ' '
print df.head()
 
Characters: 277418
Tokens: 61759
Unique tokens: 5233
Lexical diversity: 0.085
 
   freq  word
1  2437     a
2  1669   and
4  1225  that
6  1162    of
8   934    in
In [2]:
#sanity check
print df[df.word.str.contains('e')]
Empty DataFrame
Columns: [freq, word]
Index: []

Make frequency dataframe of Brown corpus words not containing 'e'

In [3]:
if not os.path.isfile('brown_df.pickle'):
    print "Processing Brown corpus from NLTK"
    from nltk.corpus import brown
    categories = []
    words = []
    frequencies = []
    for category in brown.categories():
        wordlist = brown.words(categories=category)
        freqs = nltk.FreqDist([w.lower() for w in wordlist])
        for key in freqs.keys():
            categories.append(category)
            words.append(key)
            frequencies.append(freqs[key])
    brown_df = pd.DataFrame({'word':words, 'freq':frequencies, 'category':categories})
    brown_df['nonalpha'] = brown_df.word.str.contains('[^A-Za-z]')
    brown_df.to_pickle('brown_df.pickle')
else:
    print "Reading brown_df.pickle"
    brown_df = pd.read_pickle('brown_df.pickle')
Reading brown_df.pickle
In [4]:
if not os.path.isfile('brown_non_e.pickle'):
    print 'Creating dataframe.'
    total_freq = 0
    weighted_length = 0
    brown_words = brown_df[brown_df.nonalpha == False]
    brown_words = brown_words[~brown_words.word.str.contains('e')]
    brown_words = pd.DataFrame(brown_words.groupby(['word']).sum()).reset_index(drop=False)
    brown_words['length'] = 0
    brown_words.sort('freq', ascending = False, inplace=True)
    total_freq = brown_words.freq.sum()
    median_counter = total_freq / 2
    median_found = False
    # record each word's length and accumulate frequency-weighted length totals
    for idx, row in brown_words.iterrows():
        curr_len = len(brown_words.word[idx])
        brown_words.loc[idx, 'length'] = curr_len
        weighted_length += brown_words.freq[idx] * curr_len
        median_counter -= curr_len * brown_words.freq[idx]
        if median_counter < 0 and median_found == False:
            wt_median_len = curr_len
            median_found = True
    brown_words = brown_words[['word', 'freq', 'length']]
    brown_words.to_pickle('brown_non_e.pickle')
    
else:
    print 'Reading pickle.'
    brown_words = pd.read_pickle('brown_non_e.pickle')
Reading pickle.

Determine log likelihood of word frequencies, comparing Gadsby to Brown-without-e

In [5]:
def loglike(n1, t1, n2, t2):
    """Calculates the Dunning log likelihood of an observed
    frequency n1 in a corpus of size t1, compared to a frequency n2
    in a corpus of size t2. A positive result means the item is
    relatively more frequent in corpus 1; a negative result means it
    is relatively more frequent in corpus 2. If either count is zero,
    log(0) propagates and the result is NaN."""
    e1 = t1*1.0*(n1+n2)/(t1+t2)  # expected frequencies under the null hypothesis
    e2 = t2*1.0*(n1+n2)/(t1+t2)
    LL = 2 * ((n1 * log(n1/e1)) + (n2 * log(n2/e2)))
    if n2*1.0/t2 > n1*1.0/t1:
        LL = -LL
    return LL
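
A quick sanity check on the sign convention, sketched with made-up counts rather than the corpora above:

from numpy import log  # loglike needs log in scope before it is called

# hypothetical counts: a word seen 50 times in a 10,000-token corpus vs.
# 50 times in a 100,000-token corpus should come out positive (overrepresented in corpus 1)
assert loglike(50, 10000, 50, 100000) > 0
# swapping the two corpora should flip the sign
assert loglike(50, 100000, 50, 10000) < 0
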
In [6]:
from numpy import log

t1 = df.freq.sum()
t2 = brown_words.freq.sum()
df['log_likelihood'] = 0.0

for i in range(len(df)):
    word = df.word.iloc[i]
    n1 = df.freq.iloc[i]
    fnd = brown_words[brown_words.word == word]
    if len(fnd) > 0:
        n2 = fnd.freq.iloc[0]
    else:
        n2 = 0
    df.loc[df.index[i], 'log_likelihood'] = loglike(n1, t1, n2, t2)
In [7]:
print df.sort('log_likelihood', ascending=False).head(20)
     freq        word  log_likelihood
54    200       hills      749.660964
32    297         big      630.151174
79    122       nancy      563.296979
71    138        lady      411.288896
68    140        bill      326.549776
108    80       folks      306.118227
95     97       honor      272.219017
59    187       young      271.193850
85    108       happy      267.283070
112    76       sarah      265.372978
44    237         old      253.839785
136    60    simpkins      253.691283
19    442          so      234.694974
47    229        know      226.664917
152    52  councilman      224.677398
176    46        nina      218.448458
87    106       girls      211.621005
126    66        tiny      177.231911
103    89          oh      175.102218
131    63       grand      168.765580
In [8]:
print df.sort('log_likelihood', ascending=True).head(20)
      freq     word  log_likelihood
6     1162       of    -1776.113782
9      921       to    -1112.090217
8      934       in     -616.702852
2     1669      and     -383.521998
78     124       by     -362.344996
18     471       is     -254.363544
22     403     with     -113.644883
25     383      his     -112.694084
96      96      him     -104.650941
104     86      has     -103.254890
13     585      for      -97.381118
35     284       at      -96.568517
24     384       on      -94.773170
228     34      may      -92.570425
30     301      had      -64.783953
4378     1  program      -60.080679
2736     1    among      -55.998265
247     30     must      -54.226455
41     259     from      -52.902273
33     291      not      -42.458054

Complete analysis dataframe

  • Add column of Brown non-e frequency normalized to Gadsby corpus length
  • Add absolute and relative frequency differences between Gadsby and Brown non-e
  • Add percentage of occurrences in the original Brown corpus that have a neighboring 'the' or an 'e'-containing word

Note: this cell takes a long time to run (about 17 minutes here).

In [13]:
import time
import numpy as np
from nltk.corpus import brown
br_trig = nltk.trigrams(brown.words())

if True:  # force a rebuild; original guard: not os.path.isfile('gadsby_analysis.pickle')
    start = time.time()
    print "Building dataframe."

    freq_g = df.freq.sum()
    freq_b = brown_words.freq.sum()

    df['brown_freq_normalized'] = 0.0
    for i in range(len(df)):
        word = df.word.iloc[i]
        try:
            brown_freq = brown_words[brown_words.word == word].freq.iloc[0]
        except IndexError:  # word absent from the Brown non-e list
            brown_freq = 0
        try:
            # freq_g and freq_b are integer sums, so this division truncates in Python 2
            df.loc[df.index[i], 'brown_freq_normalized'] = brown_freq * freq_g / freq_b
        except ZeroDivisionError:
            df.loc[df.index[i], 'brown_freq_normalized'] = np.nan

    df['diff_absolute'] = df.freq - df.brown_freq_normalized
    df['diff_relative'] = df.freq*1.0/df.brown_freq_normalized
    
    wprev = []
    word = []
    wnext = []
    for item in br_trig:
        a = item[0].lower()
        b = item[1].lower()
        c = item[2].lower()
        if (re.search('[a-z]', a) and
            re.search('[a-z]', b) and
            re.search('[a-z]', c) ):
            wprev.append(a)
            word.append(b)
            wnext.append(c)

    tri = pd.DataFrame({'wprev': wprev,
                        'word': word,
                        'wnext': wnext})

    def calc_pcte(row):
        dftemp = tri[tri.word == row.word]
        total = len(dftemp)
        dftempe = dftemp[(dftemp.wprev.str.contains('e') ) | (dftemp.wnext.str.contains('e'))]
        try:
            return len(dftempe) * 100.0 / total
        except ZeroDivisionError:  # word never appears mid-trigram in Brown
            return np.nan

    def calc_pctthe(row):
        dftemp = tri[tri.word == row.word]
        total = len(dftemp)
        dftempthe = dftemp[(dftemp.wprev == 'the' ) | (dftemp.wnext == 'the')]
        try:
            return len(dftempthe) * 100.0 / total
        except ZeroDivisionError:  # word never appears mid-trigram in Brown
            return np.nan

    df['pct_e'] = df.apply(calc_pcte, axis=1)
    df['pct_the'] = df.apply(calc_pctthe, axis=1)

    df.to_pickle('gadsby_analysis.pickle')
    df.to_csv('gadsby_analysis.csv')
    print "Done. {} minutes elapsed.".format(round((time.time() - start) / 60, 1))
    
else:
    print "Reading pickle."
    df = pd.read_pickle('gadsby_analysis.pickle')
Building dataframe.
Done. 17.2 minutes elapsed.
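
Most of that time goes into calc_pcte and calc_pctthe, which re-filter the full trigram dataframe once per Gadsby word. A rough sketch of a faster equivalent for pct_e (assuming the tri and df dataframes built above), precomputing the per-word neighbour shares in a single groupby pass:

# flag trigrams whose left or right neighbour contains an 'e'
tri['has_e'] = tri.wprev.str.contains('e') | tri.wnext.str.contains('e')

# per-word percentage of trigram occurrences with an 'e'-containing neighbour
grouped = tri.groupby('word')['has_e']
pct_e_by_word = grouped.sum() * 100.0 / grouped.count()

# map onto the Gadsby words; words never seen mid-trigram in Brown become NaN,
# matching calc_pcte's behaviour
df['pct_e'] = df.word.map(pct_e_by_word)
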

Graph 'volcano plot' of log-likelihood vs. percent probability of a neighbouring 'e' in the Brown corpus

In [14]:
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import seaborn

plt.figure(figsize=(9,6))

dftemp = df[['word', 'log_likelihood', 'pct_e', 'freq']].dropna()
#dftemp = dftemp[(dftemp.log_likelihood > -50) & (dftemp.log_likelihood < 50)]
dftemp = dftemp[(dftemp.freq > 10)]

plt.scatter(dftemp.log_likelihood, dftemp.pct_e, s=dftemp.freq*1.5, marker='o', 
            color='blue', alpha=0.15, label='data')
plt.title(' ')
#plt.xlim(-10,10)
plt.xlabel('Log-Likelihood that word is overrepresented in Gadsby')
plt.ylabel("Percent probability of neighbouring 'e' in Brown corpus")

plt.show()
In [12]:
df
Out[12]:
freq word log_likelihood brown_freq_normalized diff_absolute diff_relative
1 2437 a 39.837277 2125 312 1.146824
2 1669 and -383.521998 2643 -974 0.631479
4 1225 that 55.933699 970 255 1.262887
6 1162 of -1776.113782 3336 -2174 0.348321
8 934 in -616.702852 1954 -1020 0.477994
9 921 to -1112.090217 2396 -1475 0.384391
10 710 was -39.608135 899 -189 0.789766
11 694 as 1.177540 664 30 1.045181
12 667 it -22.394679 802 -135 0.831671
13 585 for -97.381118 869 -284 0.673188
14 566 you 165.609370 301 265 1.880399
15 518 but 28.177335 401 117 1.291771
17 471 i -0.008889 473 -2 0.995772
18 471 is -254.363544 926 -455 0.508639
19 442 so 234.694974 181 261 2.441989
21 421 this -5.137938 471 -50 0.893843
22 403 with -113.644883 667 -264 0.604198
23 393 all 40.463187 274 119 1.434307
24 384 on -94.773170 617 -233 0.622366
25 383 his -112.694084 640 -257 0.598437
26 364 gadsby NaN 0 364 inf
28 327 up 96.888501 173 154 1.890173
30 301 had -64.783953 470 -169 0.640426
31 298 or -19.827346 385 -87 0.774026
32 297 big 630.151174 32 265 9.281250
33 291 not -42.458054 422 -131 0.689573
34 289 an -8.174090 342 -53 0.845029
35 284 at -96.568517 492 -208 0.577236
36 280 out 31.832340 192 88 1.458333
37 270 our 134.651348 114 156 2.368421
... ... ... ... ... ... ...
5122 1 wistaria NaN 0 1 inf
5124 1 wit -0.421448 1 0 1.000000
5126 1 withhold 1.487055 0 1 inf
5128 1 wobbly 1.487055 0 1 inf
5129 1 woild NaN 0 1 inf
5130 1 wolf 0.265836 0 1 inf
5133 1 wondrous 2.358220 0 1 inf
5134 1 woo 0.982790 0 1 inf
5137 1 woodlands NaN 0 1 inf
5138 1 woodwork 0.425399 0 1 inf
5141 1 wording 0.652778 0 1 inf
5143 1 worka NaN 0 1 inf
5145 1 workmanship 0.265836 0 1 inf
5146 1 worldly 0.031798 0 1 inf
5147 1 worm 0.652778 0 1 inf
5150 1 worthington NaN 0 1 inf
5153 1 wounds 0.079149 0 1 inf
5154 1 wracking 2.358220 0 1 inf
5155 1 wrapping 0.265836 0 1 inf
5157 1 wrath 0.031798 0 1 inf
5158 1 wriggly NaN 0 1 inf
5161 1 wrought 0.982790 0 1 inf
5162 1 yak NaN 0 1 inf
5163 1 yaks 2.358220 0 1 inf
5164 1 yank 0.154473 0 1 inf
5165 1 yanks 0.652778 0 1 inf
5172 1 yucatan 2.358220 0 1 inf
5173 1 zigzagging 0.652778 0 1 inf
5177 1 zoological NaN 0 1 inf
5178 1 zooming 2.358220 0 1 inf

3934 rows × 6 columns

In [17]:
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np

plt.figure(figsize=(8,6))

dftemp = df[['word', 'log_likelihood', 'pct_the', 'freq']].dropna()
#dftemp = dftemp[(dftemp.log_likelihood < -100) | (dftemp.log_likelihood > 100)]

plt.scatter(dftemp.log_likelihood, dftemp.pct_the, s=dftemp.freq, marker='o', 
            color='blue', alpha=0.4, label='data')
plt.title(' ')
plt.xlabel('Log-Likelihood that word is overrepresented in Gadsby')
plt.ylabel("Percent probability of neighboring 'the' in Brown corpus")
Out[17]:
<matplotlib.text.Text at 0x1fc0a748>