Single letter frequency comparison of Gadsby and Brown¶

by David Taylor, www.prooffreader.com, prooffreader@gmail.com

from a collection of tools to create and analyze lists of words using python with pandas and matplotlib

In [31]:

import pandas as pd
import os
import time

words = pd.read_pickle('gadsby_analysis.pickle')

alphabet = 'abcdefghijklmnopqrstuvwxyz'

def increment_dict(d, key, increment=1): # from before I found collections.Counter
    if key in d.keys():
        d[key] += increment
    else:
        d[key] = increment
    return d

Single letter frequencies¶

In [32]:

redo_pickle = False #change to true to overwrite existing pickle if the underlying data changes

if not os.path.isfile('gadsby_analysis_letfreqs.pickle') or redo_pickle == True:    
    
    start = time.time()
    
    letfreq_dict = {}
    
    for ltr in alphabet:
        letfreq_dict[ltr] = 0
    
    for i in range(len(words)):
        wd = words.word.iloc[i]
        for ltr in wd:
            letfreq_dict[ltr] += words.freq.iloc[i]
                
    letfreqs_df = pd.DataFrame()
    for letter in alphabet:
        temp = pd.DataFrame({'letter':[letter],
                              'freq':[letfreq_dict[letter]]})
        letfreqs_df = letfreqs_df.append(temp, ignore_index=True)

    
    
    letfreqs_df['normal'] = 0.0

    letfreqsum = letfreqs_df.freq.sum()

    for i in range(len(letfreqs_df)):
        letter = letfreqs_df.letter.iloc[i]
        freq = letfreqs_df.freq.iloc[i]
        letfreqs_df.normal.iloc[i] = 100.0 * freq / letfreqsum

    letfreqs_df.set_index('letter', drop=False, inplace=True)
    
    print "%d seconds elapsed." % (time.time() - start)
    
    letfreqs_df.to_pickle('gadsby_analysis_letfreqs.pickle')

else:
    print 'Reading from pickle.'
    letfreqs_df = pd.read_pickle('gadsby_analysis_letfreqs.pickle')

    
df = letfreqs_df.copy()
df.columns = ['gadsby_freq', 'letter', 'gadsby_pct']
df = df[['letter', 'gadsby_freq', 'gadsby_pct']]
print df.head()

Reading from pickle.
       letter  gadsby_freq  gadsby_pct
letter                                
a           a        22105   11.139219
b           b         4270    2.151751
c           c         5217    2.628967
d           d         8234    4.149302
e           e            0    0.000000

In [3]:

redo_pickle = False #change to true to overwrite existing pickle if the underlying data changes

brown = pd.read_pickle('brown_non_e.pickle')

if not os.path.isfile('brown_non_e_leftfreq.pickle') or redo_pickle == True:    
    
    start = time.time()
    
    letfreq_dict = {}
    
    for ltr in alphabet:
        letfreq_dict[ltr] = 0
    
    for i in range(len(words)):
        wd = brown.word.iloc[i]
        for ltr in wd:
            letfreq_dict[ltr] += brown.freq.iloc[i]
                
    letfreqs_df = pd.DataFrame()
    for letter in alphabet:
        temp = pd.DataFrame({'letter':[letter],
                              'freq':[letfreq_dict[letter]]})
        letfreqs_df = letfreqs_df.append(temp, ignore_index=True)

    letfreqs_df['normal'] = 0.0

    letfreqsum = letfreqs_df.freq.sum()

    for i in range(len(letfreqs_df)):
        letter = letfreqs_df.letter.iloc[i]
        freq = letfreqs_df.freq.iloc[i]
        letfreqs_df.normal.iloc[i] = 100.0 * freq / letfreqsum

    letfreqs_df.set_index('letter', drop=False, inplace=True)
    
    print "%d seconds elapsed." % (time.time() - start)
    
    letfreqs_df.to_pickle('brown_non_e_letfreqs.pickle')

else:
    print 'Reading from pickle.'
    letfreqs_df = pd.read_pickle('brown_non_e_letfreqs.pickle')
    
brown = letfreqs_df.copy()
brown.columns = ['brown_freq', 'letter', 'brown_pct']
brown = brown[['letter', 'brown_freq', 'brown_pct']]
print brown.head()

1 seconds elapsed.
       letter  brown_freq  brown_pct
letter                              
a           a      208195  11.319178
b           b       27813   1.512141
c           c       48343   2.628320
d           d       78828   4.285733
e           e           0   0.000000

In [33]:

df = pd.merge(df, brown, how='inner', on='letter', left_index=True, right_index = False)

In [34]:

df['brown_normalized'] = (df.brown_freq * df.gadsby_freq.sum()/df.brown_freq.sum()).astype(int)
df['diff'] = df.gadsby_pct - df.brown_pct
df['ratio'] = df.gadsby_pct / df.brown_pct

In [35]:

def loglike(n1, t1, n2, t2):
    """Calculates Dunning log likelihood of an observation of 
    frequency n1 in a corpus of size t1, compared to a frequency n2 
    in a corpus of size t2. If result is positive, it is more 
    likely to occur in corpus 1, otherwise in corpus 2."""
    e1 = t1*1.0*(n1+n2)/(t1+t2) # expected values
    e2 = t2*1.0*(n1+n2)/(t1+t2)
    LL = 2 * ((n1 * log(n1/e1)) + n2 * (log(n2/e2)))
    if n2*1.0/t2 > n1*1.0/t1:
        LL = -LL
    return LL

In [36]:

from numpy import log

t1 = df.gadsby_freq.sum()
t2 = df.brown_freq.sum()
df['log_likelihood'] = 0.0

for i in range(len(df)):
    n1 = df.gadsby_freq.iloc[i]
    n2 = df.brown_freq.iloc[i]
    df.log_likelihood.iloc[i] = loglike(n1, t1, n2, t2)

In [8]:

df

Out[8]:

	letter	gadsby_freq	gadsby_pct	brown_freq	brown_pct	brown_normalized	diff	ratio	log_likelihood
letter
a	a	22105	11.139219	208195	11.319178	22462	-0.179959	0.984101	-5.154793
b	b	4270	2.151751	27813	1.512141	3000	0.639610	1.422983	422.428550
c	c	5217	2.628967	48343	2.628320	5215	0.000647	1.000246	0.000285
d	d	8234	4.149302	78828	4.285733	8504	-0.136431	0.968166	-7.871319
e	e	0	0.000000	0	0.000000	0	0.000000	NaN	NaN
f	f	4368	2.201136	71896	3.908853	7756	-1.707717	0.563116	-1614.594975
g	g	7175	3.615648	38652	2.101438	4170	1.514210	1.720559	1571.243993
h	h	9836	4.956587	93001	5.056293	10033	-0.099706	0.980281	-3.547318
i	i	17550	8.843849	178746	9.718090	19284	-0.874241	0.910040	-145.728668
j	j	466	0.234828	3361	0.182731	362	0.052097	1.285100	24.170986
k	k	2310	1.164062	12785	0.695097	1379	0.468965	1.674676	461.058467
l	l	10449	5.265492	77191	4.196732	8328	1.068760	1.254665	447.169631
m	m	4099	2.065581	41491	2.255789	4476	-0.190209	0.915680	-29.653479
n	n	16926	8.529401	159318	8.661826	17188	-0.132424	0.984712	-3.646751
o	o	20823	10.493189	224504	12.205868	24221	-1.712679	0.859684	-454.252165
p	p	3710	1.869554	27267	1.482456	2941	0.387098	1.261119	165.729733
q	q	103	0.051904	684	0.037188	73	0.014716	1.395728	9.165714
r	r	9426	4.749979	81398	4.425459	8782	0.324519	1.073330	41.521963
s	s	13389	6.747026	120158	6.532769	12963	0.214257	1.032797	12.438128
t	t	16770	8.450789	177992	9.677097	19203	-1.226307	0.873277	-292.146957
u	u	8338	4.201710	64735	3.519523	6984	0.682188	1.193830	221.523997
v	v	621	0.312936	5171	0.281138	557	0.031798	1.113106	6.189885
w	w	5633	2.838598	53064	2.884992	5725	-0.046393	0.983919	-1.344228
x	x	87	0.043841	1117	0.060729	120	-0.016888	0.721914	-9.412268
y	y	6333	3.191345	42703	2.321683	4607	0.869661	1.374582	515.920749
z	z	205	0.103304	899	0.048877	96	0.054427	2.113556	79.507689

In [37]:

df.to_csv('gadsby_letter_analysis.csv')

In [9]:

print df.sort('log_likelihood', ascending=False).head(10)

       letter  gadsby_freq  gadsby_pct  brown_freq  brown_pct  \
letter                                                          
g           g         7175    3.615648       38652   2.101438   
y           y         6333    3.191345       42703   2.321683   
k           k         2310    1.164062       12785   0.695097   
l           l        10449    5.265492       77191   4.196732   
b           b         4270    2.151751       27813   1.512141   
u           u         8338    4.201710       64735   3.519523   
p           p         3710    1.869554       27267   1.482456   
z           z          205    0.103304         899   0.048877   
r           r         9426    4.749979       81398   4.425459   
j           j          466    0.234828        3361   0.182731   

        brown_normalized      diff     ratio  log_likelihood  
letter                                                        
g                   4170  1.514210  1.720559     1571.243993  
y                   4607  0.869661  1.374582      515.920749  
k                   1379  0.468965  1.674676      461.058467  
l                   8328  1.068760  1.254665      447.169631  
b                   3000  0.639610  1.422983      422.428550  
u                   6984  0.682188  1.193830      221.523997  
p                   2941  0.387098  1.261119      165.729733  
z                     96  0.054427  2.113556       79.507689  
r                   8782  0.324519  1.073330       41.521963  
j                    362  0.052097  1.285100       24.170986

In [10]:

print df.sort('log_likelihood', ascending=True).head(20)

       letter  gadsby_freq  gadsby_pct  brown_freq  brown_pct  \
letter                                                          
f           f         4368    2.201136       71896   3.908853   
o           o        20823   10.493189      224504  12.205868   
t           t        16770    8.450789      177992   9.677097   
i           i        17550    8.843849      178746   9.718090   
m           m         4099    2.065581       41491   2.255789   
x           x           87    0.043841        1117   0.060729   
d           d         8234    4.149302       78828   4.285733   
a           a        22105   11.139219      208195  11.319178   
n           n        16926    8.529401      159318   8.661826   
h           h         9836    4.956587       93001   5.056293   
w           w         5633    2.838598       53064   2.884992   
c           c         5217    2.628967       48343   2.628320   
v           v          621    0.312936        5171   0.281138   
q           q          103    0.051904         684   0.037188   
s           s        13389    6.747026      120158   6.532769   
j           j          466    0.234828        3361   0.182731   
r           r         9426    4.749979       81398   4.425459   
z           z          205    0.103304         899   0.048877   
p           p         3710    1.869554       27267   1.482456   
u           u         8338    4.201710       64735   3.519523   

        brown_normalized      diff     ratio  log_likelihood  
letter                                                        
f                   7756 -1.707717  0.563116    -1614.594975  
o                  24221 -1.712679  0.859684     -454.252165  
t                  19203 -1.226307  0.873277     -292.146957  
i                  19284 -0.874241  0.910040     -145.728668  
m                   4476 -0.190209  0.915680      -29.653479  
x                    120 -0.016888  0.721914       -9.412268  
d                   8504 -0.136431  0.968166       -7.871319  
a                  22462 -0.179959  0.984101       -5.154793  
n                  17188 -0.132424  0.984712       -3.646751  
h                  10033 -0.099706  0.980281       -3.547318  
w                   5725 -0.046393  0.983919       -1.344228  
c                   5215  0.000647  1.000246        0.000285  
v                    557  0.031798  1.113106        6.189885  
q                     73  0.014716  1.395728        9.165714  
s                  12963  0.214257  1.032797       12.438128  
j                    362  0.052097  1.285100       24.170986  
r                   8782  0.324519  1.073330       41.521963  
z                     96  0.054427  2.113556       79.507689  
p                   2941  0.387098  1.261119      165.729733  
u                   6984  0.682188  1.193830      221.523997

In [12]:

import matplotlib.pyplot as plt
%matplotlib inline

In [21]:

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn

x = list(df.brown_pct)
y = list(df.gadsby_pct)
s = list(df.gadsby_freq / 10)

fig, ax = plt.subplots(figsize=(12,9))
#plt.style.use('bmh')
ax.scatter(x,y,s=s,c='b',alpha=0.3, marker='o', color='steelblue')
plt.plot([0,13], [0,13], linestyle='-', marker='None', color='r', markersize=0)
ax.set_xlim(0, 13)
ax.set_ylim(0, 13)
ax.set_ylabel("Percent frequency in Gadsby", fontsize=14)
ax.set_xlabel("Percent frequency in Brown without 'e'-containing words", fontsize=14)
ax.set_title("Comparison of letter frequencies in Gadsby, a novel without\nthe letter 'e', \
and the Brown corpus with 'e'-containing words removed", fontsize=18)
txt = []
for i, ltr in enumerate(alphabet):
    if ltr != 'e':
        txt.append( ax.text(x[i], y[i], ltr, ha="center", va="center", rotation=0,
            size=s[i]/40 ) )

plt.show()

In [30]:

import plotly.plotly as py
from plotly.graph_objs import *

#fill this in with your username and api key
py.sign_in(open(r'C:\Users\David\Documents\Dropbox\IPython_Synced\plotly_emanresu.txt', 'r').read(), 
           open(r'C:\Users\David\Documents\Dropbox\IPython_Synced\plotly_yekipa.txt', 'r').read())

data = Data([
    Bar(
        x=list(df.letter),
        y=list(df.log_likelihood)
    )])

layout=Layout(
        title="Comparison of letter frequencies in Gadsby, a novel without the letter 'e', and \
        the Brown corpus with 'e'-containing words removed",
        yaxis=YAxis(
            title='Log Likelihood keyness'
            ) 
    )

#plot_url = py.plot(data, layout=layout, filename='gadsby_letter_keyness')
py.iplot(data, layout=layout, filename='gadsby_letter_keyness')

Out[30]:

In [ ]: