by David Taylor, www.prooffreader.com, prooffreader@gmail.com
from a collection of tools to create and analyze lists of words using python with pandas and matplotlib
import pandas as pd
import os
import time
words = pd.read_pickle('gadsby_analysis.pickle')
alphabet = 'abcdefghijklmnopqrstuvwxyz'
def increment_dict(d, key, increment=1): # from before I found collections.Counter
if key in d.keys():
d[key] += increment
else:
d[key] = increment
return d
redo_pickle = False #change to true to overwrite existing pickle if the underlying data changes
if not os.path.isfile('gadsby_analysis_letfreqs.pickle') or redo_pickle == True:
start = time.time()
letfreq_dict = {}
for ltr in alphabet:
letfreq_dict[ltr] = 0
for i in range(len(words)):
wd = words.word.iloc[i]
for ltr in wd:
letfreq_dict[ltr] += words.freq.iloc[i]
letfreqs_df = pd.DataFrame()
for letter in alphabet:
temp = pd.DataFrame({'letter':[letter],
'freq':[letfreq_dict[letter]]})
letfreqs_df = letfreqs_df.append(temp, ignore_index=True)
letfreqs_df['normal'] = 0.0
letfreqsum = letfreqs_df.freq.sum()
for i in range(len(letfreqs_df)):
letter = letfreqs_df.letter.iloc[i]
freq = letfreqs_df.freq.iloc[i]
letfreqs_df.normal.iloc[i] = 100.0 * freq / letfreqsum
letfreqs_df.set_index('letter', drop=False, inplace=True)
print "%d seconds elapsed." % (time.time() - start)
letfreqs_df.to_pickle('gadsby_analysis_letfreqs.pickle')
else:
print 'Reading from pickle.'
letfreqs_df = pd.read_pickle('gadsby_analysis_letfreqs.pickle')
df = letfreqs_df.copy()
df.columns = ['gadsby_freq', 'letter', 'gadsby_pct']
df = df[['letter', 'gadsby_freq', 'gadsby_pct']]
print df.head()
Reading from pickle. letter gadsby_freq gadsby_pct letter a a 22105 11.139219 b b 4270 2.151751 c c 5217 2.628967 d d 8234 4.149302 e e 0 0.000000
redo_pickle = False #change to true to overwrite existing pickle if the underlying data changes
brown = pd.read_pickle('brown_non_e.pickle')
if not os.path.isfile('brown_non_e_leftfreq.pickle') or redo_pickle == True:
start = time.time()
letfreq_dict = {}
for ltr in alphabet:
letfreq_dict[ltr] = 0
for i in range(len(words)):
wd = brown.word.iloc[i]
for ltr in wd:
letfreq_dict[ltr] += brown.freq.iloc[i]
letfreqs_df = pd.DataFrame()
for letter in alphabet:
temp = pd.DataFrame({'letter':[letter],
'freq':[letfreq_dict[letter]]})
letfreqs_df = letfreqs_df.append(temp, ignore_index=True)
letfreqs_df['normal'] = 0.0
letfreqsum = letfreqs_df.freq.sum()
for i in range(len(letfreqs_df)):
letter = letfreqs_df.letter.iloc[i]
freq = letfreqs_df.freq.iloc[i]
letfreqs_df.normal.iloc[i] = 100.0 * freq / letfreqsum
letfreqs_df.set_index('letter', drop=False, inplace=True)
print "%d seconds elapsed." % (time.time() - start)
letfreqs_df.to_pickle('brown_non_e_letfreqs.pickle')
else:
print 'Reading from pickle.'
letfreqs_df = pd.read_pickle('brown_non_e_letfreqs.pickle')
brown = letfreqs_df.copy()
brown.columns = ['brown_freq', 'letter', 'brown_pct']
brown = brown[['letter', 'brown_freq', 'brown_pct']]
print brown.head()
1 seconds elapsed. letter brown_freq brown_pct letter a a 208195 11.319178 b b 27813 1.512141 c c 48343 2.628320 d d 78828 4.285733 e e 0 0.000000
df = pd.merge(df, brown, how='inner', on='letter', left_index=True, right_index = False)
df['brown_normalized'] = (df.brown_freq * df.gadsby_freq.sum()/df.brown_freq.sum()).astype(int)
df['diff'] = df.gadsby_pct - df.brown_pct
df['ratio'] = df.gadsby_pct / df.brown_pct
def loglike(n1, t1, n2, t2):
"""Calculates Dunning log likelihood of an observation of
frequency n1 in a corpus of size t1, compared to a frequency n2
in a corpus of size t2. If result is positive, it is more
likely to occur in corpus 1, otherwise in corpus 2."""
e1 = t1*1.0*(n1+n2)/(t1+t2) # expected values
e2 = t2*1.0*(n1+n2)/(t1+t2)
LL = 2 * ((n1 * log(n1/e1)) + n2 * (log(n2/e2)))
if n2*1.0/t2 > n1*1.0/t1:
LL = -LL
return LL
from numpy import log
t1 = df.gadsby_freq.sum()
t2 = df.brown_freq.sum()
df['log_likelihood'] = 0.0
for i in range(len(df)):
n1 = df.gadsby_freq.iloc[i]
n2 = df.brown_freq.iloc[i]
df.log_likelihood.iloc[i] = loglike(n1, t1, n2, t2)
df
letter | gadsby_freq | gadsby_pct | brown_freq | brown_pct | brown_normalized | diff | ratio | log_likelihood | |
---|---|---|---|---|---|---|---|---|---|
letter | |||||||||
a | a | 22105 | 11.139219 | 208195 | 11.319178 | 22462 | -0.179959 | 0.984101 | -5.154793 |
b | b | 4270 | 2.151751 | 27813 | 1.512141 | 3000 | 0.639610 | 1.422983 | 422.428550 |
c | c | 5217 | 2.628967 | 48343 | 2.628320 | 5215 | 0.000647 | 1.000246 | 0.000285 |
d | d | 8234 | 4.149302 | 78828 | 4.285733 | 8504 | -0.136431 | 0.968166 | -7.871319 |
e | e | 0 | 0.000000 | 0 | 0.000000 | 0 | 0.000000 | NaN | NaN |
f | f | 4368 | 2.201136 | 71896 | 3.908853 | 7756 | -1.707717 | 0.563116 | -1614.594975 |
g | g | 7175 | 3.615648 | 38652 | 2.101438 | 4170 | 1.514210 | 1.720559 | 1571.243993 |
h | h | 9836 | 4.956587 | 93001 | 5.056293 | 10033 | -0.099706 | 0.980281 | -3.547318 |
i | i | 17550 | 8.843849 | 178746 | 9.718090 | 19284 | -0.874241 | 0.910040 | -145.728668 |
j | j | 466 | 0.234828 | 3361 | 0.182731 | 362 | 0.052097 | 1.285100 | 24.170986 |
k | k | 2310 | 1.164062 | 12785 | 0.695097 | 1379 | 0.468965 | 1.674676 | 461.058467 |
l | l | 10449 | 5.265492 | 77191 | 4.196732 | 8328 | 1.068760 | 1.254665 | 447.169631 |
m | m | 4099 | 2.065581 | 41491 | 2.255789 | 4476 | -0.190209 | 0.915680 | -29.653479 |
n | n | 16926 | 8.529401 | 159318 | 8.661826 | 17188 | -0.132424 | 0.984712 | -3.646751 |
o | o | 20823 | 10.493189 | 224504 | 12.205868 | 24221 | -1.712679 | 0.859684 | -454.252165 |
p | p | 3710 | 1.869554 | 27267 | 1.482456 | 2941 | 0.387098 | 1.261119 | 165.729733 |
q | q | 103 | 0.051904 | 684 | 0.037188 | 73 | 0.014716 | 1.395728 | 9.165714 |
r | r | 9426 | 4.749979 | 81398 | 4.425459 | 8782 | 0.324519 | 1.073330 | 41.521963 |
s | s | 13389 | 6.747026 | 120158 | 6.532769 | 12963 | 0.214257 | 1.032797 | 12.438128 |
t | t | 16770 | 8.450789 | 177992 | 9.677097 | 19203 | -1.226307 | 0.873277 | -292.146957 |
u | u | 8338 | 4.201710 | 64735 | 3.519523 | 6984 | 0.682188 | 1.193830 | 221.523997 |
v | v | 621 | 0.312936 | 5171 | 0.281138 | 557 | 0.031798 | 1.113106 | 6.189885 |
w | w | 5633 | 2.838598 | 53064 | 2.884992 | 5725 | -0.046393 | 0.983919 | -1.344228 |
x | x | 87 | 0.043841 | 1117 | 0.060729 | 120 | -0.016888 | 0.721914 | -9.412268 |
y | y | 6333 | 3.191345 | 42703 | 2.321683 | 4607 | 0.869661 | 1.374582 | 515.920749 |
z | z | 205 | 0.103304 | 899 | 0.048877 | 96 | 0.054427 | 2.113556 | 79.507689 |
df.to_csv('gadsby_letter_analysis.csv')
print df.sort('log_likelihood', ascending=False).head(10)
letter gadsby_freq gadsby_pct brown_freq brown_pct \ letter g g 7175 3.615648 38652 2.101438 y y 6333 3.191345 42703 2.321683 k k 2310 1.164062 12785 0.695097 l l 10449 5.265492 77191 4.196732 b b 4270 2.151751 27813 1.512141 u u 8338 4.201710 64735 3.519523 p p 3710 1.869554 27267 1.482456 z z 205 0.103304 899 0.048877 r r 9426 4.749979 81398 4.425459 j j 466 0.234828 3361 0.182731 brown_normalized diff ratio log_likelihood letter g 4170 1.514210 1.720559 1571.243993 y 4607 0.869661 1.374582 515.920749 k 1379 0.468965 1.674676 461.058467 l 8328 1.068760 1.254665 447.169631 b 3000 0.639610 1.422983 422.428550 u 6984 0.682188 1.193830 221.523997 p 2941 0.387098 1.261119 165.729733 z 96 0.054427 2.113556 79.507689 r 8782 0.324519 1.073330 41.521963 j 362 0.052097 1.285100 24.170986
print df.sort('log_likelihood', ascending=True).head(20)
letter gadsby_freq gadsby_pct brown_freq brown_pct \ letter f f 4368 2.201136 71896 3.908853 o o 20823 10.493189 224504 12.205868 t t 16770 8.450789 177992 9.677097 i i 17550 8.843849 178746 9.718090 m m 4099 2.065581 41491 2.255789 x x 87 0.043841 1117 0.060729 d d 8234 4.149302 78828 4.285733 a a 22105 11.139219 208195 11.319178 n n 16926 8.529401 159318 8.661826 h h 9836 4.956587 93001 5.056293 w w 5633 2.838598 53064 2.884992 c c 5217 2.628967 48343 2.628320 v v 621 0.312936 5171 0.281138 q q 103 0.051904 684 0.037188 s s 13389 6.747026 120158 6.532769 j j 466 0.234828 3361 0.182731 r r 9426 4.749979 81398 4.425459 z z 205 0.103304 899 0.048877 p p 3710 1.869554 27267 1.482456 u u 8338 4.201710 64735 3.519523 brown_normalized diff ratio log_likelihood letter f 7756 -1.707717 0.563116 -1614.594975 o 24221 -1.712679 0.859684 -454.252165 t 19203 -1.226307 0.873277 -292.146957 i 19284 -0.874241 0.910040 -145.728668 m 4476 -0.190209 0.915680 -29.653479 x 120 -0.016888 0.721914 -9.412268 d 8504 -0.136431 0.968166 -7.871319 a 22462 -0.179959 0.984101 -5.154793 n 17188 -0.132424 0.984712 -3.646751 h 10033 -0.099706 0.980281 -3.547318 w 5725 -0.046393 0.983919 -1.344228 c 5215 0.000647 1.000246 0.000285 v 557 0.031798 1.113106 6.189885 q 73 0.014716 1.395728 9.165714 s 12963 0.214257 1.032797 12.438128 j 362 0.052097 1.285100 24.170986 r 8782 0.324519 1.073330 41.521963 z 96 0.054427 2.113556 79.507689 p 2941 0.387098 1.261119 165.729733 u 6984 0.682188 1.193830 221.523997
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn
x = list(df.brown_pct)
y = list(df.gadsby_pct)
s = list(df.gadsby_freq / 10)
fig, ax = plt.subplots(figsize=(12,9))
#plt.style.use('bmh')
ax.scatter(x,y,s=s,c='b',alpha=0.3, marker='o', color='steelblue')
plt.plot([0,13], [0,13], linestyle='-', marker='None', color='r', markersize=0)
ax.set_xlim(0, 13)
ax.set_ylim(0, 13)
ax.set_ylabel("Percent frequency in Gadsby", fontsize=14)
ax.set_xlabel("Percent frequency in Brown without 'e'-containing words", fontsize=14)
ax.set_title("Comparison of letter frequencies in Gadsby, a novel without\nthe letter 'e', \
and the Brown corpus with 'e'-containing words removed", fontsize=18)
txt = []
for i, ltr in enumerate(alphabet):
if ltr != 'e':
txt.append( ax.text(x[i], y[i], ltr, ha="center", va="center", rotation=0,
size=s[i]/40 ) )
plt.show()
import plotly.plotly as py
from plotly.graph_objs import *
#fill this in with your username and api key
py.sign_in(open(r'C:\Users\David\Documents\Dropbox\IPython_Synced\plotly_emanresu.txt', 'r').read(),
open(r'C:\Users\David\Documents\Dropbox\IPython_Synced\plotly_yekipa.txt', 'r').read())
data = Data([
Bar(
x=list(df.letter),
y=list(df.log_likelihood)
)])
layout=Layout(
title="Comparison of letter frequencies in Gadsby, a novel without the letter 'e', and \
the Brown corpus with 'e'-containing words removed",
yaxis=YAxis(
title='Log Likelihood keyness'
)
)
#plot_url = py.plot(data, layout=layout, filename='gadsby_letter_keyness')
py.iplot(data, layout=layout, filename='gadsby_letter_keyness')