Single letter frequency comparison of Gadsby and Brown

by David Taylor, www.prooffreader.com, [email protected]

from a collection of tools to create and analyze lists of words using python with pandas and matplotlib

In [31]:
import pandas as pd
import os
import time

words = pd.read_pickle('gadsby_analysis.pickle')

alphabet = 'abcdefghijklmnopqrstuvwxyz'

def increment_dict(d, key, increment=1): # from before I found collections.Counter
    if key in d.keys():
        d[key] += increment
    else:
        d[key] = increment
    return d

Single letter frequencies

In [32]:
redo_pickle = False #change to true to overwrite existing pickle if the underlying data changes

if not os.path.isfile('gadsby_analysis_letfreqs.pickle') or redo_pickle == True:    
    
    start = time.time()
    
    letfreq_dict = {}
    
    for ltr in alphabet:
        letfreq_dict[ltr] = 0
    
    for i in range(len(words)):
        wd = words.word.iloc[i]
        for ltr in wd:
            letfreq_dict[ltr] += words.freq.iloc[i]
                
    letfreqs_df = pd.DataFrame()
    for letter in alphabet:
        temp = pd.DataFrame({'letter':[letter],
                              'freq':[letfreq_dict[letter]]})
        letfreqs_df = letfreqs_df.append(temp, ignore_index=True)

    
    
    letfreqs_df['normal'] = 0.0

    letfreqsum = letfreqs_df.freq.sum()

    for i in range(len(letfreqs_df)):
        letter = letfreqs_df.letter.iloc[i]
        freq = letfreqs_df.freq.iloc[i]
        letfreqs_df.normal.iloc[i] = 100.0 * freq / letfreqsum

    letfreqs_df.set_index('letter', drop=False, inplace=True)
    
    print "%d seconds elapsed." % (time.time() - start)
    
    letfreqs_df.to_pickle('gadsby_analysis_letfreqs.pickle')

else:
    print 'Reading from pickle.'
    letfreqs_df = pd.read_pickle('gadsby_analysis_letfreqs.pickle')

    
df = letfreqs_df.copy()
df.columns = ['gadsby_freq', 'letter', 'gadsby_pct']
df = df[['letter', 'gadsby_freq', 'gadsby_pct']]
print df.head()
Reading from pickle.
       letter  gadsby_freq  gadsby_pct
letter                                
a           a        22105   11.139219
b           b         4270    2.151751
c           c         5217    2.628967
d           d         8234    4.149302
e           e            0    0.000000
In [3]:
redo_pickle = False #change to true to overwrite existing pickle if the underlying data changes

brown = pd.read_pickle('brown_non_e.pickle')

if not os.path.isfile('brown_non_e_leftfreq.pickle') or redo_pickle == True:    
    
    start = time.time()
    
    letfreq_dict = {}
    
    for ltr in alphabet:
        letfreq_dict[ltr] = 0
    
    for i in range(len(words)):
        wd = brown.word.iloc[i]
        for ltr in wd:
            letfreq_dict[ltr] += brown.freq.iloc[i]
                
    letfreqs_df = pd.DataFrame()
    for letter in alphabet:
        temp = pd.DataFrame({'letter':[letter],
                              'freq':[letfreq_dict[letter]]})
        letfreqs_df = letfreqs_df.append(temp, ignore_index=True)

    letfreqs_df['normal'] = 0.0

    letfreqsum = letfreqs_df.freq.sum()

    for i in range(len(letfreqs_df)):
        letter = letfreqs_df.letter.iloc[i]
        freq = letfreqs_df.freq.iloc[i]
        letfreqs_df.normal.iloc[i] = 100.0 * freq / letfreqsum

    letfreqs_df.set_index('letter', drop=False, inplace=True)
    
    print "%d seconds elapsed." % (time.time() - start)
    
    letfreqs_df.to_pickle('brown_non_e_letfreqs.pickle')

else:
    print 'Reading from pickle.'
    letfreqs_df = pd.read_pickle('brown_non_e_letfreqs.pickle')
    
brown = letfreqs_df.copy()
brown.columns = ['brown_freq', 'letter', 'brown_pct']
brown = brown[['letter', 'brown_freq', 'brown_pct']]
print brown.head()
1 seconds elapsed.
       letter  brown_freq  brown_pct
letter                              
a           a      208195  11.319178
b           b       27813   1.512141
c           c       48343   2.628320
d           d       78828   4.285733
e           e           0   0.000000
In [33]:
df = pd.merge(df, brown, how='inner', on='letter', left_index=True, right_index = False)
In [34]:
df['brown_normalized'] = (df.brown_freq * df.gadsby_freq.sum()/df.brown_freq.sum()).astype(int)
df['diff'] = df.gadsby_pct - df.brown_pct
df['ratio'] = df.gadsby_pct / df.brown_pct
In [35]:
def loglike(n1, t1, n2, t2):
    """Calculates Dunning log likelihood of an observation of 
    frequency n1 in a corpus of size t1, compared to a frequency n2 
    in a corpus of size t2. If result is positive, it is more 
    likely to occur in corpus 1, otherwise in corpus 2."""
    e1 = t1*1.0*(n1+n2)/(t1+t2) # expected values
    e2 = t2*1.0*(n1+n2)/(t1+t2)
    LL = 2 * ((n1 * log(n1/e1)) + n2 * (log(n2/e2)))
    if n2*1.0/t2 > n1*1.0/t1:
        LL = -LL
    return LL
In [36]:
from numpy import log

t1 = df.gadsby_freq.sum()
t2 = df.brown_freq.sum()
df['log_likelihood'] = 0.0

for i in range(len(df)):
    n1 = df.gadsby_freq.iloc[i]
    n2 = df.brown_freq.iloc[i]
    df.log_likelihood.iloc[i] = loglike(n1, t1, n2, t2)
In [8]:
df
Out[8]:
letter gadsby_freq gadsby_pct brown_freq brown_pct brown_normalized diff ratio log_likelihood
letter
a a 22105 11.139219 208195 11.319178 22462 -0.179959 0.984101 -5.154793
b b 4270 2.151751 27813 1.512141 3000 0.639610 1.422983 422.428550
c c 5217 2.628967 48343 2.628320 5215 0.000647 1.000246 0.000285
d d 8234 4.149302 78828 4.285733 8504 -0.136431 0.968166 -7.871319
e e 0 0.000000 0 0.000000 0 0.000000 NaN NaN
f f 4368 2.201136 71896 3.908853 7756 -1.707717 0.563116 -1614.594975
g g 7175 3.615648 38652 2.101438 4170 1.514210 1.720559 1571.243993
h h 9836 4.956587 93001 5.056293 10033 -0.099706 0.980281 -3.547318
i i 17550 8.843849 178746 9.718090 19284 -0.874241 0.910040 -145.728668
j j 466 0.234828 3361 0.182731 362 0.052097 1.285100 24.170986
k k 2310 1.164062 12785 0.695097 1379 0.468965 1.674676 461.058467
l l 10449 5.265492 77191 4.196732 8328 1.068760 1.254665 447.169631
m m 4099 2.065581 41491 2.255789 4476 -0.190209 0.915680 -29.653479
n n 16926 8.529401 159318 8.661826 17188 -0.132424 0.984712 -3.646751
o o 20823 10.493189 224504 12.205868 24221 -1.712679 0.859684 -454.252165
p p 3710 1.869554 27267 1.482456 2941 0.387098 1.261119 165.729733
q q 103 0.051904 684 0.037188 73 0.014716 1.395728 9.165714
r r 9426 4.749979 81398 4.425459 8782 0.324519 1.073330 41.521963
s s 13389 6.747026 120158 6.532769 12963 0.214257 1.032797 12.438128
t t 16770 8.450789 177992 9.677097 19203 -1.226307 0.873277 -292.146957
u u 8338 4.201710 64735 3.519523 6984 0.682188 1.193830 221.523997
v v 621 0.312936 5171 0.281138 557 0.031798 1.113106 6.189885
w w 5633 2.838598 53064 2.884992 5725 -0.046393 0.983919 -1.344228
x x 87 0.043841 1117 0.060729 120 -0.016888 0.721914 -9.412268
y y 6333 3.191345 42703 2.321683 4607 0.869661 1.374582 515.920749
z z 205 0.103304 899 0.048877 96 0.054427 2.113556 79.507689
In [37]:
df.to_csv('gadsby_letter_analysis.csv')
In [9]:
print df.sort('log_likelihood', ascending=False).head(10)
       letter  gadsby_freq  gadsby_pct  brown_freq  brown_pct  \
letter                                                          
g           g         7175    3.615648       38652   2.101438   
y           y         6333    3.191345       42703   2.321683   
k           k         2310    1.164062       12785   0.695097   
l           l        10449    5.265492       77191   4.196732   
b           b         4270    2.151751       27813   1.512141   
u           u         8338    4.201710       64735   3.519523   
p           p         3710    1.869554       27267   1.482456   
z           z          205    0.103304         899   0.048877   
r           r         9426    4.749979       81398   4.425459   
j           j          466    0.234828        3361   0.182731   

        brown_normalized      diff     ratio  log_likelihood  
letter                                                        
g                   4170  1.514210  1.720559     1571.243993  
y                   4607  0.869661  1.374582      515.920749  
k                   1379  0.468965  1.674676      461.058467  
l                   8328  1.068760  1.254665      447.169631  
b                   3000  0.639610  1.422983      422.428550  
u                   6984  0.682188  1.193830      221.523997  
p                   2941  0.387098  1.261119      165.729733  
z                     96  0.054427  2.113556       79.507689  
r                   8782  0.324519  1.073330       41.521963  
j                    362  0.052097  1.285100       24.170986  
In [10]:
print df.sort('log_likelihood', ascending=True).head(20)
       letter  gadsby_freq  gadsby_pct  brown_freq  brown_pct  \
letter                                                          
f           f         4368    2.201136       71896   3.908853   
o           o        20823   10.493189      224504  12.205868   
t           t        16770    8.450789      177992   9.677097   
i           i        17550    8.843849      178746   9.718090   
m           m         4099    2.065581       41491   2.255789   
x           x           87    0.043841        1117   0.060729   
d           d         8234    4.149302       78828   4.285733   
a           a        22105   11.139219      208195  11.319178   
n           n        16926    8.529401      159318   8.661826   
h           h         9836    4.956587       93001   5.056293   
w           w         5633    2.838598       53064   2.884992   
c           c         5217    2.628967       48343   2.628320   
v           v          621    0.312936        5171   0.281138   
q           q          103    0.051904         684   0.037188   
s           s        13389    6.747026      120158   6.532769   
j           j          466    0.234828        3361   0.182731   
r           r         9426    4.749979       81398   4.425459   
z           z          205    0.103304         899   0.048877   
p           p         3710    1.869554       27267   1.482456   
u           u         8338    4.201710       64735   3.519523   

        brown_normalized      diff     ratio  log_likelihood  
letter                                                        
f                   7756 -1.707717  0.563116    -1614.594975  
o                  24221 -1.712679  0.859684     -454.252165  
t                  19203 -1.226307  0.873277     -292.146957  
i                  19284 -0.874241  0.910040     -145.728668  
m                   4476 -0.190209  0.915680      -29.653479  
x                    120 -0.016888  0.721914       -9.412268  
d                   8504 -0.136431  0.968166       -7.871319  
a                  22462 -0.179959  0.984101       -5.154793  
n                  17188 -0.132424  0.984712       -3.646751  
h                  10033 -0.099706  0.980281       -3.547318  
w                   5725 -0.046393  0.983919       -1.344228  
c                   5215  0.000647  1.000246        0.000285  
v                    557  0.031798  1.113106        6.189885  
q                     73  0.014716  1.395728        9.165714  
s                  12963  0.214257  1.032797       12.438128  
j                    362  0.052097  1.285100       24.170986  
r                   8782  0.324519  1.073330       41.521963  
z                     96  0.054427  2.113556       79.507689  
p                   2941  0.387098  1.261119      165.729733  
u                   6984  0.682188  1.193830      221.523997  
In [12]:
import matplotlib.pyplot as plt
%matplotlib inline
In [21]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn

x = list(df.brown_pct)
y = list(df.gadsby_pct)
s = list(df.gadsby_freq / 10)

fig, ax = plt.subplots(figsize=(12,9))
#plt.style.use('bmh')
ax.scatter(x,y,s=s,c='b',alpha=0.3, marker='o', color='steelblue')
plt.plot([0,13], [0,13], linestyle='-', marker='None', color='r', markersize=0)
ax.set_xlim(0, 13)
ax.set_ylim(0, 13)
ax.set_ylabel("Percent frequency in Gadsby", fontsize=14)
ax.set_xlabel("Percent frequency in Brown without 'e'-containing words", fontsize=14)
ax.set_title("Comparison of letter frequencies in Gadsby, a novel without\nthe letter 'e', \
and the Brown corpus with 'e'-containing words removed", fontsize=18)
txt = []
for i, ltr in enumerate(alphabet):
    if ltr != 'e':
        txt.append( ax.text(x[i], y[i], ltr, ha="center", va="center", rotation=0,
            size=s[i]/40 ) )

plt.show()
In [30]:
import plotly.plotly as py
from plotly.graph_objs import *

#fill this in with your username and api key
py.sign_in(open(r'C:\Users\David\Documents\Dropbox\IPython_Synced\plotly_emanresu.txt', 'r').read(), 
           open(r'C:\Users\David\Documents\Dropbox\IPython_Synced\plotly_yekipa.txt', 'r').read())

data = Data([
    Bar(
        x=list(df.letter),
        y=list(df.log_likelihood)
    )])

layout=Layout(
        title="Comparison of letter frequencies in Gadsby, a novel without the letter 'e', and \
        the Brown corpus with 'e'-containing words removed",
        yaxis=YAxis(
            title='Log Likelihood keyness'
            ) 
    )

#plot_url = py.plot(data, layout=layout, filename='gadsby_letter_keyness')
py.iplot(data, layout=layout, filename='gadsby_letter_keyness')
Out[30]: