Notebook

word_list_tools repo¶

by David Taylor, www.prooffreader.com, prooffreader@gmail.com

a collection of tools to create and analyze lists of words using python with pandas and matplotlib

letter proximity¶

determine frequency of bigrams and conditional letter frequency in a corpus before and after a given letter

The source word list is a pandas DataFrame with columns 'word' and 'freq'. Any other columns will be ignored.

**initial_data_munge must be run first to create pickled dataframes of word lists**

In [1]:

# Base name (without extension) of the pickled word-list dataframe to analyse.
dataframe_base = 'coha_words' # change as needed to point to pickle

# Directory the source pickle is read from: <data_path>/<dataframe_base>.pickle
data_path = 'data'
# Directory this notebook writes its own outputs to (bigram pickle, CSV).
nb_path = 'letter_proximity'

# NOTE(review): not referenced by the cells visible here -- presumably consumed
# by later plotting/saving cells; confirm before removing.
save_filename = '' #used for .pickle and .png, leave as '' to use a default filename

import os
import sys
import time

import pandas as pd

# Plotly credentials are kept in text files outside the repository.
# Read them inside `with` blocks so the file handles are closed promptly, and
# strip surrounding whitespace so a trailing newline left by a text editor does
# not end up inside the key / username.
with open("../../plotly_yekipa.txt", "r") as key_file:
    plotly_apikey = key_file.read().strip()
with open("../../plotly_emanresu.txt", "r") as user_file:
    plotly_username = user_file.read().strip()

# Source word list with 'word' and 'freq' columns.
# NOTE: unpickling can execute arbitrary code -- only load pickles you created
# yourself (e.g. with initial_data_munge), never files from untrusted sources.
words = pd.read_pickle(data_path + '/' + dataframe_base + '.pickle')

alphabet = '_abcdefghijklmnopqrstuvwxyz'
# note: underscore(_) is a null value, i.e. '_a' signifies a word that starts with the letter a

def increment_dict(d, key, increment=1):
    """Add `increment` to the value stored under `key`, creating it if absent.

    Parameters:
        d: dictionary of running totals; it is modified in place.
        key: entry to update.
        increment: amount to add (default 1). For a new key this becomes the
            initial value.

    Returns:
        The same dictionary `d`, so calls can be chained.
    """
    # Test membership on the dict itself: under Python 2 `d.keys()` builds a
    # full list, which turns every call into a linear scan.
    if key in d:
        d[key] += increment
    else:
        d[key] = increment
    return d

# Every ordered pair of alphabet symbols except the empty pair '__'.
possible_bigrams = [first + second
                    for first in alphabet
                    for second in alphabet
                    if (first, second) != ('_', '_')]

# for later implementation
# Extend each bigram by one more symbol, skipping bigrams that already end a
# word (trailing '_'), since nothing can follow the end-of-word marker.
possible_trigrams = [stem + following
                     for stem in possible_bigrams
                     if stem[-1] != '_'
                     for following in alphabet]

In [2]:

class progress_bar:
    """Print a running '% complete' line for a loop of known length.

    Create the object just before the loop, passing the number of iterations,
    then call increment() exactly once per iteration. Each time the
    whole-number percentage rises it is written to stdout on the same line;
    when 100 is reached the line is ended and the elapsed wall-clock time is
    printed.

    The percentage is derived from an integer call counter rather than by
    repeatedly adding a float step, so rounding error can no longer leave the
    bar stuck at 99 without ever reporting 100 or the elapsed time.

    Attributes:
        start: time.time() value taken when the bar was created.
        loop_length: number of increment() calls expected.
        increment_size: percentage points per call (0.0 for an empty loop).
        curr_count: exact percentage completed so far.
        curr_pct: last whole-number percentage reached.
        finished: True once 100% and the elapsed time have been reported.
        overflow: True once increment() has been called more often than
            loop_length (a warning is printed the first time this happens).
    """

    def __init__(self, loop_length):
        self.start = time.time()
        self.loop_length = loop_length
        # An empty loop must not raise ZeroDivisionError.
        self.increment_size = 100.0 / loop_length if loop_length > 0 else 0.0
        self.curr_count = 0
        self.curr_pct = 0
        self.finished = False
        self.overflow = False
        self._calls = 0
        self._write('% complete:')
        if loop_length <= 0:
            # Nothing to iterate over: close the line straight away instead of
            # leaving a dangling '% complete:' prefix.
            self._finish()

    def _write(self, text):
        """Write text to stdout without an implicit newline and flush it."""
        sys.stdout.write(text)
        sys.stdout.flush()

    def _finish(self):
        """Report 100%, end the line and print the elapsed time."""
        self.finished = True
        self._write(' 100\n')
        self._write('Elapsed time: %0.1f seconds.\n' % (time.time() - self.start))

    def increment(self):
        """Record one completed iteration and print any newly reached percent."""
        self._calls += 1
        if self._calls > self.loop_length:
            # More calls than announced: warn once, then stay silent.
            if not self.overflow:
                self.overflow = True
                self._write("***** Count has gone over 100%; likely either due to an error in the loop_length specified when " +
                            "progress_bar was instantiated or in the placement of the increment function *****\n")
            return
        self.curr_count = self._calls * 100.0 / self.loop_length
        # Integer arithmetic: exactly 100 on the final expected call.
        pct = int(self._calls * 100 // self.loop_length)
        if pct > self.curr_pct:
            self.curr_pct = pct
            if pct < 100:
                self._write(' %d' % pct)
            else:
                self._finish()
                

In [3]:

redo_pickle = False #change to true to overwrite existing pickle if the underlying data changes

# Cached bigram table for this word list; rebuilt only when it is missing or
# when redo_pickle is set.
bigrams_pickle = nb_path + '/' + dataframe_base + '_bigrams.pickle'

if not os.path.isdir(nb_path):
    os.mkdir(nb_path)

if redo_pickle or not os.path.isfile(bigrams_pickle):

    start = time.time()

    # bigram -> summed frequency of all word occurrences that contain it
    bigrams = {}

    progbar = progress_bar(len(words))

    # Walk the two columns in parallel instead of doing a positional .iloc
    # lookup per row and per column.
    for word, freq in zip(words.word, words.freq):
        progbar.increment()
        # Pad with '_' so word-initial and word-final letters form bigrams too.
        wd = '_' + word + '_'
        for pos in range(len(wd) - 1):
            increment_dict(bigrams, wd[pos:pos+2], freq)

    # Build the frame in a single constructor call: appending one-row frames in
    # a loop is quadratic, and DataFrame.append no longer exists in pandas 2.x.
    bigram_keys = list(bigrams)
    df_bigrams = pd.DataFrame({'freq': [bigrams[key] for key in bigram_keys],
                               'letter_1': [key[0] for key in bigram_keys],
                               'letter_2': [key[1] for key in bigram_keys]},
                              index=bigram_keys,
                              columns=['freq', 'letter_1', 'letter_2'])
    # Unused in this cell; presumably a placeholder for the planned trigram analysis.
    df_trigrams = pd.DataFrame()
    bigrams_total = df_bigrams.freq.sum()

    # Share of each bigram among all bigram occurrences, in percent
    # (vectorised; replaces a row-wise apply followed by a merge).
    df_bigrams['pct_freq'] = df_bigrams.freq * 100.0 / bigrams_total

    df_bigrams.to_pickle(bigrams_pickle)

else:

    # NOTE: only unpickle cache files written by this notebook.
    df_bigrams = pd.read_pickle(bigrams_pickle)
    print("Pickle loaded.")

print(df_bigrams.head())

% complete: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99         freq letter_1 letter_2  pct_freq
gw     17983        g        w  0.000845
t_  39024960        t        _  1.833252
gu   1028478        g        u  0.048314
gt    213703        g        t  0.010039
gs    791136        g        s  0.037165

In [4]:

# calculate conditional probabilities
#
# For every possible bigram 'xy':
#   pct_1_given_2 = 100 * freq(xy) / total freq of bigrams whose 2nd letter is y
#   pct_2_given_1 = 100 * freq(xy) / total freq of bigrams whose 1st letter is x
# Bigrams that never occur in the corpus get 0.0 for both.

# Look-up tables computed once, instead of filtering the whole frame again for
# every one of the ~700 bigrams.
bigram_freq = dict(zip(df_bigrams.index, df_bigrams['freq']))
first_letter_totals = df_bigrams.groupby('letter_1')['freq'].sum().to_dict()
second_letter_totals = df_bigrams.groupby('letter_2')['freq'].sum().to_dict()

bi_cond_rows = []
for bigram in possible_bigrams:
    let1, let2 = (bigram[0], bigram[1])
    freq = bigram_freq.get(bigram, 0)
    if freq:
        prob_1_given_2 = freq * 100.0 / second_letter_totals[let2]
        prob_2_given_1 = freq * 100.0 / first_letter_totals[let1]
    else:
        # Absent (or zero-frequency) bigram.
        prob_1_given_2 = 0.0
        prob_2_given_1 = 0.0
    bi_cond_rows.append((let1, let2, prob_1_given_2, prob_2_given_1))

# One constructor call replaces the removed DataFrame.append / .ix idioms.
df_bi_cond = pd.DataFrame(bi_cond_rows, index=possible_bigrams,
                          columns=['let1', 'let2', 'pct_1_given_2', 'pct_2_given_1'])

print(df_bi_cond.head())

# NOTE(review): the file name has no separator between 'bi_cond' and the data
# set name; kept unchanged so existing outputs keep their path.
df_bi_cond.to_csv(nb_path + '/bi_cond' + dataframe_base + '.csv')

   let1 let2  pct_1_given_2  pct_2_given_1
_a    _    a      31.379124      11.291626
_b    _    b      68.037261       4.682944
_c    _    c      35.925454       4.386648
_d    _    d      17.032289       3.163017
_e    _    e       3.946567       2.200153

In [5]:

# create pivot table - no longer necessary
# Rows are the first letter, columns the second letter; cells hold the
# conditional percentage, with missing combinations filled with 0.

df_pivot_2g1 = df_bi_cond.pivot_table(
    index='let1', columns='let2', values='pct_2_given_1',
).fillna(0)
print(df_pivot_2g1.head())
df_pivot_1g2 = df_bi_cond.pivot_table(
    index='let1', columns='let2', values='pct_1_given_2',
).fillna(0)

let2          _          a         b         c         d          e         f  \
let1                                                                            
_      0.000000  11.819304  4.651243  4.805972  2.928036   2.448977  4.103325   
a      7.624876   0.013944  2.131876  4.320334  4.275015   0.086080  0.960819   
b      1.396656   8.525391  0.775292  0.038129  0.056488  31.217873  0.001412   
c      3.487082  12.994243  0.016849  1.689834  0.025274  15.614996  0.010531   
d     57.425042   3.132064  0.056374  0.052505  0.961671  14.826872  0.038688   

let2         g          h         i    ...            q          r         s  \
let1                                   ...                                     
_     1.704057   5.447910  6.829877    ...     0.199549   2.624486  6.902913   
a     1.986265   0.117454  3.466779    ...     0.015821  10.338392  9.232230   
b     0.016946   0.012710  5.240637    ...     0.000000   5.726431  1.957295   
c     0.004914  14.851867  6.103623    ...     0.101095   3.648554  0.496349   
d     0.602979   0.083455  8.830243    ...     0.037030   1.872496  2.464421   

let2          t          u         v         w         x         y         z  
let1                                                                          
_     16.208252   1.157056  0.653244  6.130388  0.003463  0.822234  0.022002  
a     13.989397   1.089268  2.066981  0.666647  0.197098  2.526340  0.161433  
b      0.704683  11.321527  0.302209  0.021183  0.000000  8.053720  0.000000  
c      9.802724   3.235748  0.000702  0.002106  0.000000  0.815782  0.010531  
d      0.040899   2.461105  0.335480  0.166358  0.000000  1.226407  0.001658  

[5 rows x 27 columns]

In [ ]: