# Base name (no extension) of the pickled word DataFrame to analyse. The file
# is read from <data_path>/<dataframe_base>.pickle and the same base name is
# reused when naming the outputs written under <nb_path>.
dataframe_base = 'coha_words' # change as needed to point to pickle

# Directory that holds the input pickle.
data_path = 'data'
# Directory for this analysis's outputs (cached bigram pickle, CSV export);
# it is created on demand further down the script.
nb_path = 'letter_proximity'

# NOTE(review): not referenced in the visible part of this file; presumably
# consumed by later plotting/saving code -- confirm.
save_filename = '' #used for .pickle and .png, leave as '' to use a default filename

import os
import sys
import time

import pandas as pd

# Plotly credentials are read from two small text files located two
# directories above the working directory. The context managers close each
# file as soon as it has been read instead of leaking the handle.
# NOTE(review): the contents are used verbatim, including any trailing
# newline -- confirm whether the consumer expects them stripped.
with open("../../plotly_yekipa.txt", "r") as apikey_file:
    plotly_apikey = apikey_file.read()
with open("../../plotly_emanresu.txt", "r") as username_file:
    plotly_username = username_file.read()

# Word-frequency table; the rest of the script reads its `word` and `freq`
# columns. NOTE: unpickling can execute arbitrary code, so only point this at
# pickles you created yourself.
words = pd.read_pickle(data_path + '/' + dataframe_base + '.pickle')

alphabet = '_abcdefghijklmnopqrstuvwxyz'
# note: underscore (_) is a null value marking a word boundary, i.e. '_a'
# signifies a word that starts with the letter a and 'a_' a word that ends
# with it

def increment_dict(d, key, increment=1):
    """Add *increment* to ``d[key]`` in place, creating the entry if missing.

    Args:
        d: Mutable mapping used as an accumulator; it is modified in place.
        key: Entry to update.
        increment: Amount to add (defaults to 1). For a new key this becomes
            the stored value as-is.

    Returns:
        The same mapping object ``d``, for convenience.
    """
    # Test membership on the dict itself: on Python 2 ``key in d.keys()``
    # builds a list and scans it, making every call O(len(d)).
    if key in d:
        d[key] += increment
    else:
        d[key] = increment
    return d

# Every ordered pair of alphabet symbols except '__' (null followed by null).
possible_bigrams = [let1 + let2
                    for let1 in alphabet
                    for let2 in alphabet
                    if let1 != '_' or let2 != '_']

# for later implementation: each bigram that does not already end in the
# null symbol, extended by one more alphabet symbol
possible_trigrams = [bigram + letter
                     for bigram in possible_bigrams
                     if bigram[-1] != '_'
                     for letter in alphabet]

class progress_bar:
    """Print a running integer-percentage readout for a loop of known length.

    Create the bar with the number of iterations, then call ``increment()``
    once per iteration. The output has the form
    ``% complete: 1 2 3 ... 99 100`` followed by an elapsed-time line.

    Attributes:
        start: ``time.time()`` taken at construction.
        loop_length: The expected number of ``increment()`` calls.
        increment_size: Percentage points represented by one call
            (0.0 when ``loop_length`` is zero).
        steps: Number of ``increment()`` calls made so far.
        curr_count: Current completion as a float percentage.
        curr_pct: Last whole percentage that was acted upon.
        finished: True once "100" has been printed, or once a second whole
            percentage past an overrun warning has been reached.
        overflow: True once either the overrun warning or the elapsed-time
            line has been printed (whichever comes first).
    """

    def __init__(self, loop_length):
        """Start the timer and print the ``% complete:`` header.

        Args:
            loop_length: Number of times ``increment()`` will be called.
        """
        self.start = time.time()
        self.loop_length = loop_length
        # A zero-length loop never calls increment(); do not divide by zero.
        self.increment_size = 100.0 / loop_length if loop_length else 0.0
        self.steps = 0
        self.curr_count = 0
        self.curr_pct = 0
        self.finished = False
        self.overflow = False
        # Written without a newline so the percentages follow on one line.
        sys.stdout.write('% complete:')

    def increment(self):
        """Record one completed iteration and print any newly reached percent."""
        self.steps += 1
        if self.loop_length:
            # Derive the percentage from the integer step count. Summing
            # increment_size repeatedly drifts (1000 steps of 0.1 add up to
            # 99.99999999999859), which left the bar stuck at 99 and never
            # printed "100" or the elapsed time.
            self.curr_count = 100.0 * self.steps / self.loop_length
        else:
            # Any call on a zero-length loop is already past its end.
            self.curr_count = 100.0 + self.steps

        pct = int(self.curr_count)
        if pct <= self.curr_pct:
            return
        self.curr_pct = pct

        if not self.finished:
            if pct <= 99:
                sys.stdout.write(' %d' % pct)
            elif pct == 100:
                sys.stdout.write(' 100\n')
                self.finished = True
            elif not self.overflow:
                # Jumped past 100 without ever landing on it.
                sys.stdout.write(
                    " ***** Count has gone over 100%; likely either due to an "
                    "error in the loop_length specified when "
                    "progress_bar was instantiated or in the placement of the "
                    "increment function *****\n")
                self.overflow = True
            else:
                self.finished = True

        # `overflow` doubles as the "final line already printed" flag, so the
        # elapsed time is reported once and only after a clean finish.
        if self.finished and not self.overflow:
            sys.stdout.write('Elapsed time: %0.1f seconds.\n'
                             % (time.time() - self.start))
            self.overflow = True
                

redo_pickle = False #change to true to overwrite existing pickle if the underlying data changes

if not os.path.isdir(nb_path):
    os.mkdir(nb_path)

# Location of the cached bigram table for the current word list.
bigram_pickle = nb_path+'/'+dataframe_base+'_bigrams.pickle'

if redo_pickle or not os.path.isfile(bigram_pickle):

    # Frequency-weighted count of every adjacent symbol pair, with '_'
    # padding so that word-initial and word-final letters are counted too.
    bigrams = {}

    progbar = progress_bar(len(words))

    # Walk the two columns together; a positional .iloc lookup per row and
    # per column is far slower than iterating the columns directly.
    for word, freq in zip(words.word, words.freq):
        progbar.increment()
        wd = '_' + word + '_'
        # A string of length n contains n - 1 adjacent pairs.
        for pos in range(len(wd) - 1):
            increment_dict(bigrams, wd[pos:pos+2], freq)

    # Build the table in one go. Appending one-row frames copies the whole
    # table on every iteration, and DataFrame.append no longer exists in
    # pandas 2.x. The explicit column list keeps the layout stable and lets
    # an empty word list produce an empty, well-formed table.
    keys = list(bigrams)
    df_bigrams = pd.DataFrame({'freq': [bigrams[key] for key in keys],
                               'letter_1': [key[0] for key in keys],
                               'letter_2': [key[1] for key in keys]},
                              index=keys,
                              columns=['freq', 'letter_1', 'letter_2'])
    df_trigrams = pd.DataFrame()  # placeholder; trigrams are not computed yet
    bigrams_total = df_bigrams.freq.sum()

    # Share of all bigram occurrences, in percent (vectorised over the column).
    df_bigrams['pct_freq'] = df_bigrams['freq'] * 100.0 / bigrams_total

    df_bigrams.to_pickle(bigram_pickle)

else:

    df_bigrams = pd.read_pickle(bigram_pickle)
    print("Pickle loaded.")

print(df_bigrams.head())

# calculate conditional probabilities

# Total frequency per first symbol and per second symbol, computed once
# rather than re-scanning the whole table for every candidate bigram.
# (Relies on each index label of df_bigrams being letter_1 + letter_2.)
freq_by_let1 = df_bigrams.groupby('letter_1')['freq'].sum()
freq_by_let2 = df_bigrams.groupby('letter_2')['freq'].sum()

pct_1_given_2 = []
pct_2_given_1 = []
for bigram in possible_bigrams:
    let1, let2 = (bigram[0], bigram[1])
    if bigram in df_bigrams.index:
        # .loc replaces .ix, which was removed in pandas 1.0.
        bigram_freq = df_bigrams.loc[bigram, 'freq']
        prob_1_given_2 = bigram_freq * 100.0 / freq_by_let2[let2]
        prob_2_given_1 = bigram_freq * 100.0 / freq_by_let1[let1]
        pct_1_given_2.append(prob_1_given_2)
        pct_2_given_1.append(prob_2_given_1)
    else:
        # Bigram never observed: both conditional percentages are zero.
        pct_1_given_2.append(0.0)
        pct_2_given_1.append(0.0)

# One row per possible bigram, in possible_bigrams order, built in a single
# constructor call instead of one DataFrame.append per row.
df_bi_cond = pd.DataFrame({'let1': [bg[0] for bg in possible_bigrams],
                           'let2': [bg[1] for bg in possible_bigrams],
                           'pct_1_given_2': pct_1_given_2,
                           'pct_2_given_1': pct_2_given_1},
                          index=possible_bigrams,
                          columns=['let1', 'let2',
                                   'pct_1_given_2', 'pct_2_given_1'])

print(df_bi_cond.head())

# NOTE(review): there is no separator between 'bi_cond' and the base name, so
# the file is e.g. 'bi_condcoha_words.csv'; kept unchanged in case other code
# reads that exact name.
df_bi_cond.to_csv(nb_path + '/bi_cond' + dataframe_base + '.csv')

# create pivot table - no longer necessary

# let1-by-let2 grids of the two conditional percentages; any combination
# without a row in df_bi_cond is filled with 0.
df_pivot_2g1 = df_bi_cond.pivot_table(
    values='pct_2_given_1', index='let1', columns='let2',
).fillna(0)
print(df_pivot_2g1.head())
df_pivot_1g2 = df_bi_cond.pivot_table(
    values='pct_1_given_2', index='let1', columns='let2',
).fillna(0)