# Count letter bigrams in a word-frequency table and cache the result as a
# pickle under nb_path.
#
# Console output uses only sys.stdout.write() and single-argument print(...)
# calls, both of which behave the same on Python 2 and Python 3.

import os
import sys
import time

import pandas as pd

dataframe_base = 'coha_words'  # change as needed to point to pickle
data_path = 'data'
nb_path = 'letter_proximity'
save_filename = ''  # used for .pickle and .png, leave as '' to use a default filename

# Plotly credentials are read verbatim (no stripping) from two text files;
# the context managers make sure the handles are closed again.
with open("../../plotly_yekipa.txt", "r") as credential_file:
    plotly_apikey = credential_file.read()
with open("../../plotly_emanresu.txt", "r") as credential_file:
    plotly_username = credential_file.read()

# NOTE: unpickling can execute arbitrary code; only load pickles you trust.
words = pd.read_pickle(data_path + '/' + dataframe_base + '.pickle')

alphabet = '_abcdefghijklmnopqrstuvwxyz'
# note: underscore(_) is a null value, i.e. '_a' signifies a word that starts
# with the letter a


def increment_dict(d, key, increment=1):
    """Add ``increment`` to ``d[key]``, creating the entry if it is missing.

    The dictionary is modified in place and also returned.
    """
    # dict.get avoids materialising d.keys() (a full list on Python 2) on
    # every call.
    d[key] = d.get(key, 0) + increment
    return d


# Every two-character combination of the alphabet except the all-null '__'.
possible_bigrams = [
    let1 + let2
    for let1 in alphabet
    for let2 in alphabet
    if not (let1 == '_' and let2 == '_')
]

# for later implementation: every bigram that does not end in the null
# marker, extended by each character of the alphabet.
possible_trigrams = [
    bigram + letter
    for bigram in possible_bigrams
    if bigram[-1] != '_'
    for letter in alphabet
]


class progress_bar:
    """Print a running percent-complete counter for a loop of known length.

    Create the object just before the loop and call ``increment()`` exactly
    once per iteration.  Each new whole percentage is written on one line;
    reaching 100 ends the line and reports the elapsed wall-clock time.
    """

    def __init__(self, loop_length):
        """Start the clock and print the '% complete:' prefix.

        loop_length -- number of times increment() is expected to be called.
        Raises ZeroDivisionError when loop_length is 0.
        """
        self.start = time.time()
        self.increment_size = 100.0 / loop_length
        self.curr_count = 0     # percentage completed so far, as a float
        self.curr_pct = 0       # highest whole percentage handled so far
        self.finished = False   # True once 100% has been printed
        self.overflow = False   # True once the over-100% warning was printed
        self._loop_length = loop_length
        self._calls = 0
        sys.stdout.write('% complete:')

    def increment(self):
        """Register one completed iteration and print any new percentage."""
        self._calls += 1
        # Derive the percentage from the call count rather than summing
        # increment_size: repeated float addition drifts (0.1 added 1000
        # times is 99.9999...), which could stop 100 from ever being reached.
        self.curr_count = self._calls * 100.0 / self._loop_length
        new_pct = int(self._calls * 100 // self._loop_length)
        if new_pct <= self.curr_pct:
            return
        self.curr_pct = new_pct

        if not self.finished:
            if new_pct < 100:
                sys.stdout.write(' %d' % new_pct)
                return
            sys.stdout.write(' 100\n')
            self.finished = True
            print('Elapsed time: %0.1f seconds.' % (time.time() - self.start))

        # Warn once if increment() is called more often than announced.
        if new_pct > 100 and not self.overflow:
            print("***** Count has gone over 100%; likely either due to an "
                  "error in the loop_length specified when "
                  "progress_bar was instantiated or in the placement of the "
                  "increment function *****")
            self.overflow = True


redo_pickle = False  # change to true to overwrite existing pickle if the underlying data changes
bigrams_pickle_path = nb_path + '/' + dataframe_base + '_bigrams.pickle'

if not os.path.isdir(nb_path):
    os.mkdir(nb_path)

if redo_pickle or not os.path.isfile(bigrams_pickle_path):
    start = time.time()
    bigrams = {}
    progbar = progress_bar(len(words))
    for word, freq in zip(words.word, words.freq):
        progbar.increment()
        # Pad with the null marker so word-initial and word-final letters
        # are counted as bigrams too.
        wd = '_' + word + '_'
        for pos in range(len(wd) - 1):
            increment_dict(bigrams, wd[pos:pos + 2], freq)

    # Build the frame in one step; growing it row by row with
    # DataFrame.append is quadratic and the method was removed in pandas 2.0.
    bigram_keys = list(bigrams)
    df_bigrams = pd.DataFrame(
        {
            'letter_1': [key[0] for key in bigram_keys],
            'letter_2': [key[1] for key in bigram_keys],
            'freq': [bigrams[key] for key in bigram_keys],
        },
        index=bigram_keys,
    )

    df_trigrams = pd.DataFrame()  # placeholder; not filled in the code shown here
    bigrams_total = df_bigrams.freq.sum()

    def calc_pct_freq(df):
        """Return one row's freq as a percentage of bigrams_total."""
        return pd.Series({'pct_freq': df.freq * 100.0 / bigrams_total})

    df_bigrams = df_bigrams.merge(df_bigrams.apply(calc_pct_freq, axis=1),
                                  left_index=True, right_index=True)
    df_bigrams.to_pickle(bigrams_pickle_path)
else:
    df_bigrams = pd.read_pickle(bigrams_pickle_path)
    print("Pickle loaded.")
print(df_bigrams.head())

# calculate conditional probabilities
#
# For every candidate bigram 'xy' two percentages are stored:
#   pct_1_given_2 -- freq('xy') as a share of all bigrams whose 2nd letter is y
#   pct_2_given_1 -- freq('xy') as a share of all bigrams whose 1st letter is x
# Candidates that are absent from df_bigrams get 0.0 for both.

# Per-letter totals are computed once instead of re-filtering df_bigrams for
# every candidate bigram.
freq_by_first_letter = df_bigrams.groupby('letter_1')['freq'].sum()
freq_by_second_letter = df_bigrams.groupby('letter_2')['freq'].sum()

cond_rows = []
for bigram in possible_bigrams:
    let1, let2 = bigram[0], bigram[1]
    if bigram in df_bigrams.index:
        # .loc replaces the label lookup formerly done with .ix, which was
        # deprecated and later removed from pandas.
        bigram_freq = df_bigrams.loc[bigram, 'freq']
        prob_1_given_2 = bigram_freq * 100.0 / freq_by_second_letter[let2]
        prob_2_given_1 = bigram_freq * 100.0 / freq_by_first_letter[let1]
    else:
        prob_1_given_2 = 0.0
        prob_2_given_1 = 0.0
    cond_rows.append((let1, let2, prob_1_given_2, prob_2_given_1))

# One constructor call instead of one DataFrame.append per bigram (quadratic,
# and removed in pandas 2.0).
df_bi_cond = pd.DataFrame(
    cond_rows,
    index=possible_bigrams,
    columns=['let1', 'let2', 'pct_1_given_2', 'pct_2_given_1'],
)

print(df_bi_cond.head())
df_bi_cond.to_csv(nb_path + '/bi_cond' + dataframe_base + '.csv')

# create pivot table - no longer necessary
# Rows are the first letter, columns the second letter; combinations without
# a value are filled with 0.
df_pivot_2g1 = pd.pivot_table(data=df_bi_cond, values='pct_2_given_1',
                              index='let1', columns='let2').fillna(0)
print(df_pivot_2g1.head())
df_pivot_1g2 = pd.pivot_table(data=df_bi_cond, values='pct_1_given_2',
                              index='let1', columns='let2').fillna(0)