word_list_tools repo

by David Taylor, www.prooffreader.com, [email protected]

a collection of tools to create and analyze lists of words using python with pandas and matplotlib

letter_distributions

determine letter distributions within the word based on a word list with frequencies

word list is pandas dataframe with columns 'word' and 'freq'. Any other columns will be ignored.

initial_data_munge must be run first to create pickled dataframes of word lists

In [1]:
# Configuration: which pickled word list to analyze and where files live.
dataframe_base = 'brown_words' # change as needed to point to pickle
dataframe_description = 'Brown Corpus from Natural Language Toolkit'

data_path = 'data'
nb_path = 'letter_distributions'

save_filename = '' #used for .pickle and .png, leave as '' to use a default filename

import pandas as pd
import os

# Load the word-list dataframe (columns 'word' and 'freq') from its pickle;
# initial_data_munge must have been run first to create it.
words = pd.read_pickle('{0}/{1}.pickle'.format(data_path, dataframe_base))
Create the letters dataframe, with letter frequencies distributed among a number of bins determined by the user.
In [2]:
b_len = 15 #number of bins, decided by user

if save_filename = '':
    letters_pickle = nb_path + '/' + 'letters_' + dataframe_base + '_' + str(b_len) + '.pickle'
else:
    letters_pickle = nb_path + '/' + save_filename + '.pickle'

if not os.path.isfile(letters_pickle):

    print 'Calculating letters dataframe.'
    p_step = b_len # to facilitate readability; cross product
    
    # dataframe for results; z is just a temporary list to facilitate dataframe initialization
    z = [0] * b_len
    letters = pd.DataFrame({'a': z, 'b': z, 'c': z, 'd': z, 'e': z, 'f': z, 'g': z, 
                            'h': z, 'i': z, 'j': z, 'k': z, 'l': z, 'm': z, 'n': z, 
                            'o': z, 'p': z, 'q': z, 'r': z, 's': z, 't': z, 'u': z, 
                            'v': z, 'w': z, 'x': z, 'y': z, 'z': z})
    
    for i in range(len(words)):
        freq = words.freq.iloc[i]
        wd = words.word.iloc[i]
        p_len = len(wd)
        b_step = p_len
        bp_mult = b_len * p_len #use multiple instead of range of 0 to 1 (or 0 to 100) to avoid floats not adding together exactly
        b_curnum = 0 # current bin
        p_curnum = 0 # current letter
        curmult = 0 # current position of algorithm from 0 to bp_mult
        temp = 0
    
        if p_len > 1:
            while curmult < bp_mult:
                temp += 1
                overlap = min((b_curnum + 1) * b_step, (p_curnum + 1) * p_step) - curmult
                #try:
                letters[wd[p_curnum]][b_curnum] += freq * overlap / bp_mult
                curmult += overlap
                if (b_curnum + 1) * b_step == curmult:
                    b_curnum += 1
                if (p_curnum + 1) * p_step == curmult:
                    p_curnum += 1
                if temp >= 100:
                    print "Error; more than 100 iterations on word " + wd
                    break
        
        letters.to_pickle(letters_pickle)

else:
    print 'Reading from pickle.'
    letters = pd.read_pickle(letters_pickle)
Reading from pickle.

Dataframes with bins as rows and letters as columns. Bin number equals row number, so iloc can be used to look up values.

* letters (raw frequencies)
* letters_norm (frequencies normalized so that each letter's maximum has a value of 100; integers, not floats)
* letters_equal_area (letters_norm adjusted so that the area under each letter's line is the same for all graphs)

Dataframe 'letters_stats' has statistics for each letter, the row indexes are the letters. Columns are:

* max_freq: maximum raw frequency of each letter
* max_bin: bin where max_freq occurs
* total_freq: the total frequency of each letter
* pct_freq: the total frequency as a percent of all letters; for representative English word lists, e is the top letter at about 12 percent
* area_under_norm: area under normalized lines

Dict 'letters_overall' has statistics for the entire dataset:

* max_freq: the maximum raw frequency of any letter
* max_letter: the letter with the maximum raw frequency
* total_freq: the total frequency of all letters combined
* max_pct: the maximum pct_freq of any letter

List 'colors' is assigned by user, with nested lists of lower boundary (the first should normally be zero) and hex color string. The bins follow the usual [low, high) python boundaries. The maximum value is calculated for the user.
In [3]:
# Sequential palette (light yellow -> dark red). Each entry pairs a lower
# pct_freq boundary with a hex color; bins follow [low, high) semantics.
_color_bounds = (0, 0.1, 0.5, 1, 2, 3, 5, 9)
_color_hexes = ('#ffffcc', '#ffeda0', '#fed976', '#feb24c',
                '#fd8d3c', '#fc4e2a', '#e31a1c', '#b10026')
colors = [[lo, hx] for lo, hx in zip(_color_bounds, _color_hexes)]
In [4]:
# Build per-letter statistics and three derived dataframes from 'letters':
#   letters_norm       - each column scaled so its max is 100 (integer math)
#   letters_equal_area - letters_norm rescaled so every letter's area matches
#   letters_compromise - midpoint of letters_norm and letters_equal_area
# Order matters throughout: letters_stats.max_freq must be filled before
# letters_norm is scaled, and norm_area before letters_equal_area.
alphabet = 'abcdefghijklmnopqrstuvwxyz'
    
letters_norm = letters.copy() # note that values are kept as integers for now; the graphs are narrow enough that it should not matter
letters_equal_area = letters.copy() 
letters_overall = {}

# One row per letter; columns are filled in below.
letters_stats = pd.DataFrame({'max_freq': [0] * 26}, index=list(alphabet))
letters_stats['max_bin'] = 0
letters_stats['total_freq'] = 0
letters_stats['pct_freq'] = 0.0
letters_stats['norm_area'] = 0
letters_stats['color'] = ''

# Raw per-letter stats: peak frequency, the bin where the peak occurs
# (first bin on ties), and the total frequency across all bins.
for ltr in alphabet:
    letters_stats.max_freq.ix[ltr] = letters[ltr].max()
    letters_stats.max_bin.ix[ltr] = letters[letters[ltr] == letters_stats.max_freq.ix[ltr]].index[0]
    letters_stats.total_freq.ix[ltr] = letters[ltr].sum()

letters_overall['max_freq'] = letters_stats.max_freq.max()
letters_overall['total_freq'] = letters_stats.total_freq.sum()
# .name of the first matching row is the letter itself (rows are indexed by letter)
letters_overall['max_letter'] = letters_stats[letters_stats.max_freq == letters_overall['max_freq']].iloc[0].name

# pct_freq: each letter's share of all letter occurrences, as a percent.
# letters_norm: scale each column to a 0-100 range; *= 100 before /= max
# keeps precision under integer division (Python 2 '/' truncates here).
for ltr in alphabet:
    letters_stats.pct_freq.ix[ltr] = (letters_stats.total_freq.ix[ltr] * 100.0
                                      / letters_overall['total_freq'])
    for rw in range(len(letters_norm)):
        letters_norm[ltr].iloc[rw] *= 100
        letters_norm[ltr].iloc[rw] /= letters_stats['max_freq'].ix[ltr]
    
letters_overall['max_pct'] = letters_stats.pct_freq.max()

for ltr in alphabet:
    # assign colors based on pct_max and color list
    # (colors is sorted ascending, so the last boundary <= pct_freq wins)
    color = ''
    for i in range(len(colors)):
        if letters_stats.pct_freq.ix[ltr] >= colors[i][0]:
             color = colors[i][1]
    letters_stats.color.ix[ltr] = color
    # calculate area under norm lines
    # (trapezoid rule per segment: rectangle under the lower point plus the triangle)
    area = 0
    for rw in range(len(letters_norm) - 1):
        height0 = letters_norm[ltr].iloc[rw]
        height1 = letters_norm[ltr].iloc[rw+1]
        area += min(height0, height1)
        area += 0.5 * abs(height1 - height0)
    letters_stats.norm_area[ltr] = area

letters_overall['max_area'] = letters_stats.norm_area.max()    
letters_overall['max_equal_area'] = 0

# Rescale each letter so its area equals the largest area, tracking the
# new global maximum value for the subsequent 0-100 rescale.
for ltr in alphabet:
    for rw in range(len(letters_equal_area)):
        letters_equal_area[ltr].iloc[rw] = (letters_norm[ltr].iloc[rw] * 
                                            letters_overall['max_area'] / letters_stats.norm_area[ltr])
    letters_overall['max_equal_area'] = max(letters_overall['max_equal_area'], letters_equal_area[ltr].max())

#rescale to 100
for ltr in alphabet:
    for rw in range(len(letters_equal_area)):
        letters_equal_area[ltr].iloc[rw] *= 100
        letters_equal_area[ltr].iloc[rw] /= letters_overall['max_equal_area']
        
import math
# Legend ceiling: smallest integer >= the largest pct_freq.
letters_overall['max_pct_for_legend'] = int(math.ceil(letters_overall['max_pct']))

# Compromise view: element-wise average of the normalized and equal-area
# dataframes, then rescaled to a 0-100 range like the others.
letters_overall['max_compromise'] = 0
letters_compromise = letters_norm.copy()
for ltr in alphabet:
    for rw in range(len(letters_equal_area)):
        letters_compromise[ltr].iloc[rw] = (letters_norm[ltr].iloc[rw] + letters_equal_area[ltr].iloc[rw]) / 2
    letters_overall['max_compromise'] = max(letters_overall['max_compromise'], letters_compromise[ltr].max())
    
#rescale to 100
for ltr in alphabet:
    for rw in range(len(letters_equal_area)):
        letters_compromise[ltr].iloc[rw] *= 100
        letters_compromise[ltr].iloc[rw] /= letters_overall['max_compromise']

Print parts of dataframes for sanity check:

In [5]:
print "letters['a']:"
print letters['a']
print "\nletters_norm['a']:"
print letters_norm['a']
print "\nletters_equal_area['a']:"
print letters_equal_area['a']
print "\nletters_compromise['a']:"
print letters_compromise['a']
print "\nletters_stats:"
print letters_stats
print "\nLetters_overall:"
print letters_overall
letters['a']:
0     5555
1     5379
2     5436
3     6069
4     7241
5     6180
6     6195
7     5307
8     4705
9     4417
10    2505
11    1460
12     816
13     268
14     178
Name: a, dtype: int64

letters_norm['a']:
0      76
1      74
2      75
3      83
4     100
5      85
6      85
7      73
8      64
9      60
10     34
11     20
12     11
13      3
14      2
Name: a, dtype: int64

letters_equal_area['a']:
0     34
1     33
2     33
3     37
4     45
5     38
6     38
7     33
8     29
9     27
10    15
11     8
12     4
13     1
14     0
Name: a, dtype: int64

letters_compromise['a']:
0     55
1     53
2     54
3     60
4     72
5     61
6     61
7     53
8     46
9     43
10    24
11    14
12     7
13     2
14     1
Name: a, dtype: int64

letters_stats:
   max_freq  max_bin  total_freq   pct_freq  norm_area    color
a      7241        4       61711   7.763015        806  #e31a1c
b      2510        0       13683   1.721271        489  #feb24c
c      2204        0       14703   1.849583        607  #feb24c
d      5739       14       29344   3.691366        444  #fc4e2a
e     12911       12       96941  12.194818        692  #b10026
f      2883        9       31145   3.917925        986  #fc4e2a
g      1324       14        9848   1.238842        655  #feb24c
h      9057        5       61327   7.714709        642  #e31a1c
i      6988        6       53692   6.754254        736  #e31a1c
j       244        0        1044   0.131331        367  #ffeda0
k       489       14        4313   0.542559        799  #fed976
l      2410       10       23446   2.949420        906  #fd8d3c
m      1999        0       16735   2.105201        757  #fd8d3c
n      6193        9       57395   7.220078        869  #e31a1c
o      8328        5       79843  10.043953        910  #b10026
p      1804        0        9730   1.223998        474  #feb24c
q        91        0         415   0.052205        401  #ffffcc
r      3563       10       37179   4.676980        973  #fc4e2a
s      6166       14       46107   5.800090        663  #e31a1c
t     10132        0       88497  11.132594        790  #b10026
u      2279        6       17318   2.178540        736  #fd8d3c
v       697        8        5615   0.706346        777  #fed976
w      3744        0       19512   2.454537        457  #fd8d3c
x       198        2         898   0.112965        433  #ffeda0
y      3037       14       14292   1.797881        407  #feb24c
z        39        8         203   0.025537        504  #ffffcc

Letters_overall:
{'total_freq': 794936, 'max_area': 986, 'max_equal_area': 268, 'max_letter': 'e', 'max_freq': 12911, 'max_compromise': 100, 'max_pct_for_legend': 13, 'max_pct': 12.194818199200942}
In [6]:
save_plot = True

column_list = list('abcdefghijklmnopqrstuvwxyz')
x_length = b_len

import matplotlib.pyplot as plt

# One narrow subplot per letter, stacked vertically.
fig, axes = plt.subplots(26, 1, figsize=(12, 90))

for pos, ltr in enumerate(column_list):
    ax = axes[pos]
    ax.plot(range(x_length), letters_compromise[ltr], color='k', linewidth = 3, label = ltr)
    ax.set_ylim(0, 100)
    fill_color = letters_stats['color'].ix[ltr]
    ax.fill_between(range(x_length), letters_compromise[ltr], color=fill_color, interpolate=True)
    # hide all axis furniture; the letter in the ylabel is the only annotation
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xticklabels([], size=0)
    ax.set_yticklabels([])
    ax.get_xaxis().set_visible(False)
    # trailing spaces push the rotated label away from the plot edge
    ax.set_ylabel(ltr + '       ', size=24, rotation='horizontal')

# loop-invariant: adjust spacing once, not on every iteration as originally
plt.subplots_adjust(hspace=0.1)

if save_filename != '':
    plot_name = nb_path + '/' + save_filename + '.png'
else:
    plot_name = nb_path + '/letters_' + dataframe_base + '_' + str(b_len) + '.png'

if save_plot:
    plt.savefig(plot_name)
In [ ]: