Load Gadsby and create dataframe of frequency distribution

In [1]:
import re
import os
import nltk
raw = open('gadsby_full_lower.txt', 'r').read()
tokens = nltk.word_tokenize(raw)
text = nltk.Text(tokens)

# summary statistics
print ' '
print "Characters: {}".format(len(raw))
print "Tokens: {}".format(len(tokens))
print "Unique tokens: {}".format(len(set(tokens)))
print "Lexical diversity: {:.3f}".format(len(set(tokens))*1.0/len(tokens))

# create frequency distribution dataframe
fdist = nltk.FreqDist(text)
import pandas as pd
df = pd.DataFrame({'word': fdist.keys(), 'freq': fdist.values()})
df = df[~df.word.str.contains('[^A-Za-z]')] # remove tokens with non-alphabetic characters
print ' '
print df.head()
 
Characters: 277418
Tokens: 61759
Unique tokens: 5233
Lexical diversity: 0.085
 
   freq  word
1  2437     a
2  1669   and
4  1225  that
6  1162    of
8   934    in
In [2]:
#sanity check
print df[df.word.str.contains('e')]
Empty DataFrame
Columns: [freq, word]
Index: []

Make frequency dataframe of Brown corpus words not containing 'e'

In [3]:
if not os.path.isfile('brown_df.pickle'):
    print "Processing Brown corpus from NLTK"
    from nltk.corpus import brown
    categories = []
    words = []
    frequencies = []
    for category in brown.categories():
        wordlist = brown.words(categories=category)
        freqs = nltk.FreqDist([w.lower() for w in wordlist])
        for key in freqs.keys():
            categories.append(category)
            words.append(key)
            frequencies.append(freqs[key])
    brown_df = pd.DataFrame({'word':words, 'freq':frequencies, 'category':categories})
    brown_df['nonalpha'] = brown_df.word.str.contains('[^A-Za-z]')
    brown_df.to_pickle('brown_df.pickle')
else:
    print "Reading brown_df.pickle"
    brown_df = pd.read_pickle('brown_df.pickle')
Reading brown_df.pickle
In [4]:
if not os.path.isfile('brown_non_e.pickle'):
    print 'Creating dataframe.'
    total_freq = 0
    weighted_length = 0
    brown_words = brown_df[brown_df.nonalpha == False]
    brown_words = brown_words[~brown_words.word.str.contains('e')]
    brown_words = pd.DataFrame(brown_words.groupby(['word']).sum()).reset_index(drop=False)
    brown_words['length'] = 0
    brown_words.sort('freq', ascending = False, inplace=True)
    total_freq = brown_words.freq.sum()
    median_counter = total_freq / 2
    median_found = False
    # record each word's length and accumulate frequency-weighted length totals
    for idx, row in brown_words.iterrows():
        curr_len = len(brown_words.word[idx])
        brown_words.loc[idx, 'length'] = curr_len
        weighted_length += brown_words.freq[idx] * curr_len
        median_counter -= curr_len * brown_words.freq[idx]
        if median_counter < 0 and median_found == False:
            wt_median_len = curr_len
            median_found = True
    brown_words = brown_words[['word', 'freq', 'length']]
    brown_words.to_pickle('brown_non_e.pickle')
    
else:
    print 'Reading pickle.'
    brown_words = pd.read_pickle('brown_non_e.pickle')
Reading pickle.

Determine log likelihood of word frequencies, comparing Gadsby to Brown-without-e

In [5]:
def loglike(n1, t1, n2, t2):
    """Calculates the Dunning log likelihood of an observed
    frequency n1 in a corpus of size t1, compared to a frequency n2
    in a corpus of size t2. A positive result means the item is
    relatively more frequent in corpus 1; a negative result means it
    is relatively more frequent in corpus 2. If either count is zero,
    log(0) propagates and the result is NaN."""
    e1 = t1*1.0*(n1+n2)/(t1+t2)  # expected frequencies under the null hypothesis
    e2 = t2*1.0*(n1+n2)/(t1+t2)
    LL = 2 * ((n1 * log(n1/e1)) + (n2 * log(n2/e2)))
    if n2*1.0/t2 > n1*1.0/t1:
        LL = -LL
    return LL
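
A quick sanity check on the sign convention, sketched with made-up counts rather than the corpora above:

from numpy import log  # loglike needs log in scope before it is called

# hypothetical counts: a word seen 50 times in a 10,000-token corpus vs.
# 50 times in a 100,000-token corpus should come out positive (overrepresented in corpus 1)
assert loglike(50, 10000, 50, 100000) > 0
# swapping the two corpora should flip the sign
assert loglike(50, 100000, 50, 10000) < 0
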
In [6]:
from numpy import log

t1 = df.freq.sum()
t2 = brown_words.freq.sum()
df['log_likelihood'] = 0.0

for i in range(len(df)):
    word = df.word.iloc[i]
    n1 = df.freq.iloc[i]
    fnd = brown_words[brown_words.word == word]
    if len(fnd) > 0:
        n2 = fnd.freq.iloc[0]
    else:
        n2 = 0
    df.loc[df.index[i], 'log_likelihood'] = loglike(n1, t1, n2, t2)
In [7]:
print df.sort('log_likelihood', ascending=False).head(20)
     freq        word  log_likelihood
54    200       hills      749.660964
32    297         big      630.151174
79    122       nancy      563.296979
71    138        lady      411.288896
68    140        bill      326.549776
108    80       folks      306.118227
95     97       honor      272.219017
59    187       young      271.193850
85    108       happy      267.283070
112    76       sarah      265.372978
44    237         old      253.839785
136    60    simpkins      253.691283
19    442          so      234.694974
47    229        know      226.664917
152    52  councilman      224.677398
176    46        nina      218.448458
87    106       girls      211.621005
126    66        tiny      177.231911
103    89          oh      175.102218
131    63       grand      168.765580
In [8]:
print df.sort('log_likelihood', ascending=True).head(20)
      freq     word  log_likelihood
6     1162       of    -1776.113782
9      921       to    -1112.090217
8      934       in     -616.702852
2     1669      and     -383.521998
78     124       by     -362.344996
18     471       is     -254.363544
22     403     with     -113.644883
25     383      his     -112.694084
96      96      him     -104.650941
104     86      has     -103.254890
13     585      for      -97.381118
35     284       at      -96.568517
24     384       on      -94.773170
228     34      may      -92.570425
30     301      had      -64.783953
4378     1  program      -60.080679
2736     1    among      -55.998265
247     30     must      -54.226455
41     259     from      -52.902273
33     291      not      -42.458054

Complete analysis dataframe

  • Add column of Brown non-e frequency normalized to Gadsby corpus length
  • Add absolute and relative frequency differences between Gadsby and Brown non-e
  • Add percentage of occurrences in the original Brown corpus that have a neighboring 'the' or an 'e'-containing word

Note: this cell takes a long time to run (about 17 minutes here).

In [13]:
import time
import numpy as np
from nltk.corpus import brown
br_trig = nltk.trigrams(brown.words())

if True:  # force a rebuild; original guard: not os.path.isfile('gadsby_analysis.pickle')
    start = time.time()
    print "Building dataframe."

    freq_g = df.freq.sum()
    freq_b = brown_words.freq.sum()

    df['brown_freq_normalized'] = 0.0
    for i in range(len(df)):
        word = df.word.iloc[i]
        try:
            brown_freq = brown_words[brown_words.word == word].freq.iloc[0]
        except IndexError:  # word absent from the Brown non-e list
            brown_freq = 0
        try:
            # freq_g and freq_b are integer sums, so this division truncates in Python 2
            df.loc[df.index[i], 'brown_freq_normalized'] = brown_freq * freq_g / freq_b
        except ZeroDivisionError:
            df.loc[df.index[i], 'brown_freq_normalized'] = np.nan

    df['diff_absolute'] = df.freq - df.brown_freq_normalized
    df['diff_relative'] = df.freq*1.0/df.brown_freq_normalized
    
    wprev = []
    word = []
    wnext = []
    for item in br_trig:
        a = item[0].lower()
        b = item[1].lower()
        c = item[2].lower()
        if (re.search('[a-z]', a) and
            re.search('[a-z]', b) and
            re.search('[a-z]', c) ):
            wprev.append(a)
            word.append(b)
            wnext.append(c)

    tri = pd.DataFrame({'wprev': wprev,
                        'word': word,
                        'wnext': wnext})

    def calc_pcte(row):
        dftemp = tri[tri.word == row.word]
        total = len(dftemp)
        dftempe = dftemp[(dftemp.wprev.str.contains('e') ) | (dftemp.wnext.str.contains('e'))]
        try:
            return len(dftempe) * 100.0 / total
        except ZeroDivisionError:  # word never appears mid-trigram in Brown
            return np.nan

    def calc_pctthe(row):
        dftemp = tri[tri.word == row.word]
        total = len(dftemp)
        dftempthe = dftemp[(dftemp.wprev == 'the' ) | (dftemp.wnext == 'the')]
        try:
            return len(dftempthe) * 100.0 / total
        except ZeroDivisionError:  # word never appears mid-trigram in Brown
            return np.nan

    df['pct_e'] = df.apply(calc_pcte, axis=1)
    df['pct_the'] = df.apply(calc_pctthe, axis=1)

    df.to_pickle('gadsby_analysis.pickle')
    df.to_csv('gadsby_analysis.csv')
    print "Done. {} minutes elapsed.".format(round((time.time() - start) / 60, 1))
    
else:
    print "Reading pickle."
    df = pd.read_pickle('gadsby_analysis.pickle')
Building dataframe.
Done. 17.2 minutes elapsed.
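
Most of that time goes into calc_pcte and calc_pctthe, which re-filter the full trigram dataframe once per Gadsby word. A rough sketch of a faster equivalent for pct_e (assuming the tri and df dataframes built above), precomputing the per-word neighbour shares in a single groupby pass:

# flag trigrams whose left or right neighbour contains an 'e'
tri['has_e'] = tri.wprev.str.contains('e') | tri.wnext.str.contains('e')

# per-word percentage of trigram occurrences with an 'e'-containing neighbour
grouped = tri.groupby('word')['has_e']
pct_e_by_word = grouped.sum() * 100.0 / grouped.count()

# map onto the Gadsby words; words never seen mid-trigram in Brown become NaN,
# matching calc_pcte's behaviour
df['pct_e'] = df.word.map(pct_e_by_word)
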

Graph 'volcano plot' of log-likelihood vs. percent probability of a neighbouring 'e' in the Brown corpus

In [14]:
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import seaborn

plt.figure(figsize=(9,6))

dftemp = df[['word', 'log_likelihood', 'pct_e', 'freq']].dropna()
#dftemp = dftemp[(dftemp.log_likelihood > -50) & (dftemp.log_likelihood < 50)]
dftemp = dftemp[(dftemp.freq > 10)]

plt.scatter(dftemp.log_likelihood, dftemp.pct_e, s=dftemp.freq*1.5, marker='o', 
            color='blue', alpha=0.15, label='data')
plt.title(' ')
#plt.xlim(-10,10)
plt.xlabel('Log-Likelihood that word is overrepresented in Gadsby')
plt.ylabel("Percent probability of neighbouring 'e' in Brown corpus")

plt.show()
In [12]:
df
Out[12]:
freq word log_likelihood brown_freq_normalized diff_absolute diff_relative
1 2437 a 39.837277 2125 312 1.146824
2 1669 and -383.521998 2643 -974 0.631479
4 1225 that 55.933699 970 255 1.262887
6 1162 of -1776.113782 3336 -2174 0.348321
8 934 in -616.702852 1954 -1020 0.477994
9 921 to -1112.090217 2396 -1475 0.384391
10 710 was -39.608135 899 -189 0.789766
11 694 as 1.177540 664 30 1.045181
12 667 it -22.394679 802 -135 0.831671
13 585 for -97.381118 869 -284 0.673188
14 566 you 165.609370 301 265 1.880399
15 518 but 28.177335 401 117 1.291771
17 471 i -0.008889 473 -2 0.995772
18 471 is -254.363544 926 -455 0.508639
19 442 so 234.694974 181 261 2.441989
21 421 this -5.137938 471 -50 0.893843
22 403 with -113.644883 667 -264 0.604198
23 393 all 40.463187 274 119 1.434307
24 384 on -94.773170 617 -233 0.622366
25 383 his -112.694084 640 -257 0.598437
26 364 gadsby NaN 0 364 inf
28 327 up 96.888501 173 154 1.890173
30 301 had -64.783953 470 -169 0.640426
31 298 or -19.827346 385 -87 0.774026
32 297 big 630.151174 32 265 9.281250
33 291 not -42.458054 422 -131 0.689573
34 289 an -8.174090 342 -53 0.845029
35 284 at -96.568517 492 -208 0.577236
36 280 out 31.832340 192 88 1.458333
37 270 our 134.651348 114 156 2.368421
... ... ... ... ... ... ...
5122 1 wistaria NaN 0 1 inf
5124 1 wit -0.421448 1 0 1.000000
5126 1 withhold 1.487055 0 1 inf
5128 1 wobbly 1.487055 0 1 inf
5129 1 woild NaN 0 1 inf
5130 1 wolf 0.265836 0 1 inf
5133 1 wondrous 2.358220 0 1 inf
5134 1 woo 0.982790 0 1 inf
5137 1 woodlands NaN 0 1 inf
5138 1 woodwork 0.425399 0 1 inf
5141 1 wording 0.652778 0 1 inf
5143 1 worka NaN 0 1 inf
5145 1 workmanship 0.265836 0 1 inf
5146 1 worldly 0.031798 0 1 inf
5147 1 worm 0.652778 0 1 inf
5150 1 worthington NaN 0 1 inf
5153 1 wounds 0.079149 0 1 inf
5154 1 wracking 2.358220 0 1 inf
5155 1 wrapping 0.265836 0 1 inf
5157 1 wrath 0.031798 0 1 inf
5158 1 wriggly NaN 0 1 inf
5161 1 wrought 0.982790 0 1 inf
5162 1 yak NaN 0 1 inf
5163 1 yaks 2.358220 0 1 inf
5164 1 yank 0.154473 0 1 inf
5165 1 yanks 0.652778 0 1 inf
5172 1 yucatan 2.358220 0 1 inf
5173 1 zigzagging 0.652778 0 1 inf
5177 1 zoological NaN 0 1 inf
5178 1 zooming 2.358220 0 1 inf

3934 rows × 6 columns

In [17]:
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np

plt.figure(figsize=(8,6))

dftemp = df[['word', 'log_likelihood', 'pct_the', 'freq']].dropna()
#dftemp = dftemp[(dftemp.log_likelihood < -100) | (dftemp.log_likelihood > 100)]

plt.scatter(dftemp.log_likelihood, dftemp.pct_the, s=dftemp.freq, marker='o', 
            color='blue', alpha=0.4, label='data')
plt.title(' ')
plt.xlabel('Log-Likelihood that word is overrepresented in Gadsby')
plt.ylabel("Percent probability of neighboring 'the' in Brown corpus")
Out[17]:
<matplotlib.text.Text at 0x1fc0a748>