By David Taylor, www.prooffreader.com, prooffreader@gmail.com
A collection of tools to create and analyze lists of words using Python with pandas and matplotlib.
Determines the frequency of bigrams in a corpus, and the conditional frequency of the letters that appear before and after a given letter.
The source word list is a pandas DataFrame with columns 'word' and 'freq'. Any other columns will be ignored.
** initial_data_munge must be run first to create the pickled DataFrames of word lists **
# --- Notebook configuration ---
# Base name (no extension) of the pickled word-list DataFrame; it is also used
# as the prefix of the bigram pickle and as part of the CSV name written below.
dataframe_base = 'coha_words' # change as needed to point to pickle
# Directory the source word-list pickle is read from.
data_path = 'data'
# Directory this notebook writes its own pickle/CSV output to (created if missing).
nb_path = 'letter_proximity'
# NOTE(review): not referenced in the visible part of the file -- presumably
# consumed by later plotting/saving code; confirm.
save_filename = '' #used for .pickle and .png, leave as '' to use a default filename
import pandas as pd
import os
import time
# Plotly credentials are read from text files two directories above the
# working directory.  Context managers make sure the file handles are closed
# instead of being left to the garbage collector.
# NOTE(review): the values are used exactly as read, including any trailing
# newline -- confirm whether they should be stripped before use.
with open("../../plotly_yekipa.txt", "r") as _apikey_file:
    plotly_apikey = _apikey_file.read()
with open("../../plotly_emanresu.txt", "r") as _username_file:
    plotly_username = _username_file.read()

# Source word list: a DataFrame with (at least) 'word' and 'freq' columns.
words = pd.read_pickle(data_path + '/' + dataframe_base + '.pickle')

alphabet = '_abcdefghijklmnopqrstuvwxyz'
# note: underscore(_) is a null value, i.e. '_a' signifies a word that starts with the letter a
def increment_dict(d, key, increment=1):
    """Add *increment* to ``d[key]``, creating the entry when it is missing.

    Args:
        d: Dictionary of running totals; it is modified in place.
        key: Entry to update.
        increment: Amount to add (defaults to 1).  For a new key this value
            is stored as-is.

    Returns:
        The same dictionary object that was passed in.
    """
    # Test membership on the dict itself: on Python 2 ``d.keys()`` builds a
    # list, which turns every call into a linear scan.
    if key in d:
        d[key] += increment
    else:
        d[key] = increment
    return d
# Every ordered pair of alphabet symbols except the double null '__'.
possible_bigrams = [
    first + second
    for first in alphabet
    for second in alphabet
    if first != '_' or second != '_'
]

# for later implementation
# A bigram that already ends in the null symbol cannot be extended, so only
# the remaining bigrams are combined with each alphabet symbol.
possible_trigrams = [
    stem + symbol
    for stem in possible_bigrams
    if stem[-1] != '_'
    for symbol in alphabet
]
class progress_bar:
    """Print a running integer percentage for a loop of known length.

    Create the bar just before the loop and call ``increment()`` once per
    iteration.  The output has the form::

        % complete: 1 2 3 ... 99 100
        Elapsed time: 12.3 seconds.

    Progress is derived from an exact integer iteration count rather than by
    summing a floating-point step, so the final "100" and the elapsed time
    are always reported when the loop runs ``loop_length`` times.

    Attributes:
        start: ``time.time()`` value captured when the bar was created.
        loop_length: Expected number of ``increment()`` calls.
        increment_size: Percentage represented by one iteration (0.0 when
            ``loop_length`` is zero).
        iterations: Number of ``increment()`` calls made so far.
        curr_count: Current progress as a float percentage.
        curr_pct: Last integer percentage that was reached.
        finished: True once 100% has been reported.
        overflow: True once more than ``loop_length`` calls have been made.
    """

    def __init__(self, loop_length):
        """Start the timer and print the '% complete:' prefix.

        Args:
            loop_length: Number of times ``increment()`` is expected to be
                called.  Zero is accepted and no longer raises.
        """
        import time
        self.start = time.time()
        self.loop_length = loop_length
        # Guard the division so that an empty loop does not raise.
        self.increment_size = 100.0 / loop_length if loop_length else 0.0
        self.iterations = 0
        self.curr_count = 0
        self.curr_pct = 0
        self.finished = False
        self.overflow = False
        self._write('% complete:')

    def _write(self, text):
        """Write *text* to stdout immediately (valid on Python 2 and 3)."""
        import sys
        sys.stdout.write(text)
        sys.stdout.flush()

    def increment(self):
        """Record one loop iteration and print any newly reached percentage.

        Percentages below 100 are appended to the current line.  Reaching
        100 ends the line and prints the elapsed time once.  Calls beyond
        ``loop_length`` print a single warning and are otherwise ignored.
        """
        import time
        self.iterations += 1

        if self.iterations > self.loop_length:
            if not self.overflow:
                # Close the percentage line first if 100% was never reached.
                prefix = '' if self.finished else '\n'
                self._write(prefix +
                            "***** Count has gone over 100%; likely either due to an error in the loop_length specified when " +
                            "progress_bar was instantiated or in the placement of the increment function *****\n")
                self.overflow = True
            return

        self.curr_count = self.iterations * self.increment_size
        # Integer arithmetic: exactly 100 when iterations == loop_length.
        pct = int(self.iterations * 100 // self.loop_length)
        if pct <= self.curr_pct:
            return
        self.curr_pct = pct

        if pct < 100:
            self._write(' %d' % pct)
        else:
            self._write(' 100\n')
            self.finished = True
            self._write('Elapsed time: %0.1f seconds.\n' % (time.time() - self.start))
redo_pickle = False #change to true to overwrite existing pickle if the underlying data changes

# Cached bigram table for the selected word list.
bigrams_pickle = nb_path + '/' + dataframe_base + '_bigrams.pickle'

if not os.path.isdir(nb_path):
    os.mkdir(nb_path)

if redo_pickle or not os.path.isfile(bigrams_pickle):
    # Count every adjacent pair of symbols, weighted by the word's frequency.
    # Each word is padded with '_' on both sides so that word-initial and
    # word-final letters are counted as bigrams with the null symbol.
    bigrams = {}
    progbar = progress_bar(len(words))
    for word, freq in zip(words.word, words.freq):
        progbar.increment()
        wd = '_' + word + '_'
        for pos in range(len(wd) - 1):
            increment_dict(bigrams, wd[pos:pos + 2], freq)

    # Build the table in a single step.  Appending one row at a time is
    # quadratic, and DataFrame.append no longer exists in pandas 2.0.
    # Naming the columns explicitly keeps an empty corpus from failing below.
    bigram_keys = list(bigrams)
    df_bigrams = pd.DataFrame(
        {
            'freq': [bigrams[key] for key in bigram_keys],
            'letter_1': [key[0] for key in bigram_keys],
            'letter_2': [key[1] for key in bigram_keys],
        },
        index=bigram_keys,
        columns=['freq', 'letter_1', 'letter_2'],
    )
    df_trigrams = pd.DataFrame()  # placeholder, not filled in here

    # Share of all counted bigram occurrences, in percent.
    bigrams_total = df_bigrams.freq.sum()
    df_bigrams['pct_freq'] = df_bigrams.freq * 100.0 / bigrams_total

    df_bigrams.to_pickle(bigrams_pickle)
else:
    df_bigrams = pd.read_pickle(bigrams_pickle)
    print("Pickle loaded.")
print(df_bigrams.head())
% complete: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 freq letter_1 letter_2 pct_freq gw 17983 g w 0.000845 t_ 39024960 t _ 1.833252 gu 1028478 g u 0.048314 gt 213703 g t 0.010039 gs 791136 g s 0.037165
# calculate conditional probabilities
# For every possible bigram 'xy':
#   pct_1_given_2 = freq(xy) as a percentage of all bigrams whose second letter is y
#   pct_2_given_1 = freq(xy) as a percentage of all bigrams whose first letter is x
# Bigrams that never occur in the corpus get 0.0 for both values.

# Per-letter totals are computed once instead of re-filtering the whole
# table for each bigram.
bigram_freq = df_bigrams['freq']
freq_by_letter_1 = df_bigrams.groupby('letter_1')['freq'].sum()
freq_by_letter_2 = df_bigrams.groupby('letter_2')['freq'].sum()
observed_bigrams = set(df_bigrams.index)

bi_cond_rows = []
for bigram in possible_bigrams:
    let1, let2 = (bigram[0], bigram[1])
    if bigram in observed_bigrams:
        # .loc replaces the .ix indexer, which was removed in pandas 1.0.
        bigram_count = bigram_freq.loc[bigram]
        prob_1_given_2 = bigram_count * 100.0 / freq_by_letter_2.get(let2, 0)
        prob_2_given_1 = bigram_count * 100.0 / freq_by_letter_1.get(let1, 0)
    else:
        prob_1_given_2 = 0.0
        prob_2_given_1 = 0.0
    bi_cond_rows.append((let1, let2, prob_1_given_2, prob_2_given_1))

# One row per possible bigram, indexed by the bigram itself.
df_bi_cond = pd.DataFrame(
    bi_cond_rows,
    index=possible_bigrams,
    columns=['let1', 'let2', 'pct_1_given_2', 'pct_2_given_1'],
)
print(df_bi_cond.head())
df_bi_cond.to_csv(nb_path + '/bi_cond' + dataframe_base + '.csv')
let1 let2 pct_1_given_2 pct_2_given_1 _a _ a 31.379124 11.291626 _b _ b 68.037261 4.682944 _c _ c 35.925454 4.386648 _d _ d 17.032289 3.163017 _e _ e 3.946567 2.200153
# create pivot table - no longer necessary
# Both tables have one row per first letter (let1) and one column per second
# letter (let2); combinations missing from the pivot are filled with 0.
pivot_axes = dict(index='let1', columns='let2')
df_pivot_2g1 = df_bi_cond.pivot_table(values='pct_2_given_1', **pivot_axes).fillna(0)
print(df_pivot_2g1.head())
df_pivot_1g2 = df_bi_cond.pivot_table(values='pct_1_given_2', **pivot_axes).fillna(0)
let2 _ a b c d e f \ let1 _ 0.000000 11.819304 4.651243 4.805972 2.928036 2.448977 4.103325 a 7.624876 0.013944 2.131876 4.320334 4.275015 0.086080 0.960819 b 1.396656 8.525391 0.775292 0.038129 0.056488 31.217873 0.001412 c 3.487082 12.994243 0.016849 1.689834 0.025274 15.614996 0.010531 d 57.425042 3.132064 0.056374 0.052505 0.961671 14.826872 0.038688 let2 g h i ... q r s \ let1 ... _ 1.704057 5.447910 6.829877 ... 0.199549 2.624486 6.902913 a 1.986265 0.117454 3.466779 ... 0.015821 10.338392 9.232230 b 0.016946 0.012710 5.240637 ... 0.000000 5.726431 1.957295 c 0.004914 14.851867 6.103623 ... 0.101095 3.648554 0.496349 d 0.602979 0.083455 8.830243 ... 0.037030 1.872496 2.464421 let2 t u v w x y z let1 _ 16.208252 1.157056 0.653244 6.130388 0.003463 0.822234 0.022002 a 13.989397 1.089268 2.066981 0.666647 0.197098 2.526340 0.161433 b 0.704683 11.321527 0.302209 0.021183 0.000000 8.053720 0.000000 c 9.802724 3.235748 0.000702 0.002106 0.000000 0.815782 0.010531 d 0.040899 2.461105 0.335480 0.166358 0.000000 1.226407 0.001658 [5 rows x 27 columns]