"""Compute word "trendiness" from COHA decade-level frequency data and chart it.

Pipeline
--------
1. Load per-decade word frequencies; drop words present in all 20 decades
   (they can never spike from zero); keep the union of the top 1000 words
   per decade by max pct and by total pct.
2. Fill missing decades with pct == 0 so every word has a complete series.
3. Interpolate mid-decade points so even a single-decade spike has a
   measurable peak width.
4. For each word, find the span where pct stays >= 50% of its maximum;
   trendiness = peak height / peak width. Only words that both start and
   end below the cutoff (a genuine rise-and-fall) are recorded.
5. Chart the results with matplotlib.

NOTE: originally a Python 2 Jupyter notebook; modernized to Python 3 and
current pandas (sort_values / concat replace the removed sort / append).
"""

import pickle  # noqa: F401  (kept from original; pd.read_pickle is used instead)
import time
from math import ceil, floor, log10

import pandas as pd
import matplotlib.pyplot as plt

# %matplotlib inline  -- notebook magic; re-enable when running in Jupyter


class progress_bar:
    """Console progress indicator for a loop of known length.

    Call increment() once per iteration and finish() after the loop.
    """

    def __init__(self, loop_length):
        self.start = time.time()
        self.increment_size = 100.0 / loop_length
        self.curr_count = 0
        self.curr_pct = 0
        self.overflow = False
        print('% complete:', end=' ')

    def increment(self):
        """Advance the bar; prints each new whole percent reached."""
        self.curr_count += self.increment_size
        if int(self.curr_count) > self.curr_pct:
            self.curr_pct = int(self.curr_count)
            if self.curr_pct <= 100:
                print(self.curr_pct, end=' ')
            elif not self.overflow:
                # Going past 100% means loop_length or increment() placement is wrong.
                print("\n*!* Count has gone over 100%; likely either due to:\n"
                      "*!* - an error in the loop_length specified when "
                      "progress_bar was instantiated\n"
                      "*!* - an error in the placement of the increment() function")
                print('*!* Elapsed time when progress bar full: %0.1f seconds.'
                      % (time.time() - self.start))
                self.overflow = True

    def finish(self):
        """Print total elapsed time for the loop."""
        if self.curr_pct == 99:
            # Rounding can leave the counter at 99; display 100 anyway.
            print("100", end=' ')
        if self.overflow:
            print('*!* Elapsed time after end of loop: %0.1f seconds.\n'
                  % (time.time() - self.start))
        else:
            print('\nElapsed time: %0.1f seconds.\n' % (time.time() - self.start))


def determine_y_limit(x):
    """Round x up at its second-most-significant digit.

    Used to pick a tidy upper limit for chart y axes, e.g. 0.0437 -> 0.044.
    """
    significance = int(floor(log10(x)))
    val = floor(x / (10 ** (significance - 1))) + 1
    return val * (10 ** (significance - 1))


# ---------------------------------------------------------------- load data
df = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
df = df[df.nonalpha == False]          # noqa: E712  (boolean column filter)
df['year'] = df.decade + 5             # use the middle of each decade
df = df[['word', 'year', 'pct']]
df.sort_values(['word', 'year'], inplace=True)
print(df.head())

words = df.word.unique()
print(len(words))

# Remove any word whose count is 20, i.e. present in all 20 decades and so
# never has a zero value -- such words cannot produce a rise-and-fall peak.
dfcounts = pd.DataFrame(df.groupby('word').pct.count())
wordcounts20 = list(dfcounts[dfcounts.pct == 20].index)
df = df[~df.word.isin(wordcounts20)]

# Keep the union of the top 1000 words for each decade, both by max and by
# total pct within the decade.
topwords = set()
for yr in range(1815, 2015, 10):
    dftemp = df[df.year == yr]
    topwords.update(dftemp.groupby('word')['pct'].max().nlargest(1000).index)
    topwords.update(dftemp.groupby('word')['pct'].sum().nlargest(1000).index)
print(len(topwords))
df = df[df.word.isin(topwords)]

# ------------------------------------------ add missing decades as pct == 0
pbar = progress_bar(len(df))
bin_size = 1000                        # process 1000 words at a time
# Proper ceiling division (the old int(ceil(a/b)) + 1 relied on Python 2
# integer division and always produced one extra empty bin).
bins = int(ceil(len(words) / bin_size))
new_word = []
new_year = []
new_pct = []
for i in range(bins):
    loopwords = words[i * bin_size:(i + 1) * bin_size]
    loopdf = df[df.word.isin(loopwords)]
    for j in range(len(loopdf)):
        word = loopdf.word.iloc[j]
        year = loopdf.year.iloc[j]
        pbar.increment()
        if j == 0 or word != loopdf.word.iloc[j - 1]:
            cur_yr = 1815              # first row for this word
        else:
            cur_yr += 10
        # Fill any gap of decades before this row's year.
        while cur_yr < year:
            new_word.append(word)
            new_year.append(cur_yr)
            new_pct.append(0)
            cur_yr += 10
        # On the word's last row, pad trailing decades out to 2005.
        # BUG FIX: the original tested `cur_yr != year` here, but cur_yr
        # always equals year at this point, so trailing zeros were never added.
        if j == len(loopdf) - 1 or word != loopdf.word.iloc[j + 1]:
            cur_yr += 10
            while cur_yr <= 2005:
                new_word.append(word)
                new_year.append(cur_yr)
                new_pct.append(0)
                cur_yr += 10
pbar.finish()
print(len(new_word))
print(len(df))
df = pd.concat([df, pd.DataFrame({'word': new_word, 'year': new_year,
                                  'pct': new_pct})], ignore_index=True)
df.sort_values(['word', 'year'], inplace=True)
df.reset_index(drop=True, inplace=True)
df.to_pickle('coha_1_trendiness_checkpoint.pickle')

# Sanity check: a late, single-spike word should now have zero-filled decades.
df_orig = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
print(df_orig[df_orig.word == "dukakis"])
print(df[df.word == "dukakis"])

# ------------------- interpolate mid-decade points (years ending in 0)
# so that peaks can be measured even for single-decade words.
pbar = progress_bar(len(df))
new_word = []
new_year = []
new_pct = []
# Iterate the whole frame once instead of 10,000-row bins: the original
# binning treated the first row of every bin as a word start, silently
# skipping interpolation across bin boundaries.
words_arr = df.word.values
years_arr = df.year.values
pcts_arr = df.pct.values
for j in range(len(df)):
    pbar.increment()
    if j > 0 and words_arr[j] == words_arr[j - 1]:
        new_word.append(words_arr[j])
        new_year.append(years_arr[j] - 5)
        # midpoint between this decade's value and the previous one
        new_pct.append((pcts_arr[j] + pcts_arr[j - 1]) / 2.0)
pbar.finish()
print(len(new_word))
print(len(df))
df = pd.concat([df, pd.DataFrame({'word': new_word, 'year': new_year,
                                  'pct': new_pct})], ignore_index=True)
df.sort_values(['word', 'year'], inplace=True)
df.reset_index(drop=True, inplace=True)
df.to_pickle('coha_1_trendiness.pickle')

# Sanity check again after interpolation.
df = pd.read_pickle("coha_1_trendiness.pickle")
df_orig = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
print(df_orig[df_orig.word == 'dukakis'])
print(df[df.word == 'dukakis'])

# --------------------------------------------------- compute trendiness
wordmax = df.groupby('word').pct.max()
peak_height_cutoff = 0.5               # peak = span at >= 50% of the word's max

words = []
years_start = []
years_max = []
years_end = []
trendiness = []
pbar = progress_bar(len(df))
for i in range(len(df)):
    pbar.increment()
    year = df.year.iloc[i]
    pct = df.pct.iloc[i]
    if year == 1815:
        # First row of a new word (guaranteed by the zero-filling above):
        # reset all per-word state.
        word = df.word.iloc[i]
        cur_max = wordmax[word]
        year_start = 0
        year_max = 0
        year_end = 0
        starts_below_cutoff = pct < peak_height_cutoff * cur_max
        # BUG FIX: was carried over from the previous word (and unbound for
        # the very first word), leaking dip state across words.
        dips_below_cutoff = False
    if pct >= peak_height_cutoff * cur_max:
        # Peak starts at the first above-cutoff year after a dip below.
        if year_start == 0 and dips_below_cutoff:
            year_start = year
        year_end = year
    else:
        dips_below_cutoff = True
    if pct == cur_max:
        year_max = year
    if (year == 2005 and starts_below_cutoff
            and pct < peak_height_cutoff * cur_max):  # i.e. ends below cutoff
        # Guard against a zero-width peak (shouldn't occur once mid-decade
        # interpolation is in place, since the midpoints sit exactly at the
        # cutoff, but avoids a ZeroDivisionError on degenerate data).
        if year_end > year_start:
            words.append(word)
            years_start.append(year_start)
            years_max.append(year_max)
            years_end.append(year_end)
            trendiness.append(cur_max / (year_end - year_start))
pbar.finish()

trends = pd.DataFrame({'word': words, 'year_start': years_start,
                       'year_max': years_max, 'year_end': years_end,
                       'trendiness': trendiness})
trends = trends[['word', 'trendiness', 'year_start', 'year_max', 'year_end']]
trends.sort_values('trendiness', ascending=False, inplace=True)
trends.to_csv('coha_trendiness.csv')
trends.to_pickle('coha_trendiness.pickle')
print(len(trends))
print(trends.head(50))
print(df[df.word == 'reagan'])

# Top trendy word for each decade (guard: a decade may have no entries).
for i in range(1825, 2005, 10):
    decade_words = trends[trends.year_max == i].word
    if len(decade_words):
        print(i, decade_words.iloc[0])


# ------------------------------------------------------------- charting
def make_chart(df, words, form='line', title='', colors=None, smoothing=0,
               baseline='sym', png_name='', ymax=None, save_path='.'):
    """Chart pct-over-time for the given words.

    Parameters
    ----------
    df : DataFrame with 'word', 'year', 'pct' columns.
    words : list of words to plot.
    form : 'line' (one axis), 'subplots_auto' (each subplot scaled to its own
        max, shading tinted by relative height), 'subplots_same' (shared y
        scale), or 'stream' (stacked stream graph).
    colors : list of hex colors; None/[] uses the default 12-color palette.
    smoothing : half-width (in rows) of a moving-average window; 0 = off.
    baseline : passed to stackplot for 'stream' form.
    png_name : if non-empty, save the figure to save_path/png_name.png.
        (save_path is a new keyword with a backward-compatible default;
        the original referenced an undefined name.)
    ymax : fixed y-axis limit for 'line'; None = auto via determine_y_limit.
    """
    dataframe = pd.pivot_table(df[df['word'].isin(words)], values='pct',
                               index='year', columns=['word'])
    dataframe.sort_index(inplace=True)
    startyear = min(dataframe.index)
    endyear = max(dataframe.index)
    legend_size = 0.01

    max_y = 0
    for word in dataframe.columns:
        max_y = max(max_y, dataframe[word].max())
        if smoothing > 0:
            # Centered moving average; the right edge of the slice is
            # exclusive, matching the original window exactly.
            newvalues = []
            for row in range(len(dataframe)):
                start = max(0, row - smoothing)
                end = min(len(dataframe) - 1, row + smoothing)
                newvalues.append(dataframe[word].iloc[start:end].mean())
            dataframe[word] = newvalues

    y_text = "% of words in corpus"
    num_series = len(dataframe.columns)
    if not colors:
        colors = ["#1f78b4", "#ae4ec9", "#33a02c", "#fb9a99", "#e31a1c",
                  "#a6cee3", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a",
                  "#ffff99", "#b15928"]
    num_colors = len(colors)
    if num_series > num_colors:
        print("Warning: colors will be repeated.")
    x_values = list(dataframe.index)

    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300,
                               facecolor='w', edgecolor='w')
        for counter, word in enumerate(words):
            ax.plot(x_values, dataframe[word], label=word,
                    color=colors[counter % num_colors], linewidth=3)
        if ymax is None:
            ax.set_ylim(0, determine_y_limit(max_y))
        else:
            ax.set_ylim(0, ymax)
        ax.set_title(title, size=20)
        ax.set_xlim(startyear, endyear)
        ax.set_ylabel(y_text, size=20)
        ax.set_xticks(range(1810, 2010, 10))
        # Shrink the axis slightly to make room for the legend below it.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                         box.width, box.height * (1 - legend_size)])
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
                  fancybox=True, shadow=True,
                  ncol=min(5, num_series), fontsize=16)

    if form == 'subplots_auto':
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5 * num_series))
        print('Maximum alpha: %d percent' % (determine_y_limit(max_y)))
        for counter, word in enumerate(dataframe.columns):
            current_ymax = dataframe[word].max()
            # Tint the fill in proportion to this word's share of the overall max.
            tint = 1.0 * current_ymax / determine_y_limit(max_y)
            axes[counter].plot(x_values, dataframe[word], color='k')
            axes[counter].set_ylim(0, determine_y_limit(current_ymax))
            axes[counter].set_xlim(startyear, endyear)
            axes[counter].fill_between(x_values, dataframe[word],
                                       color=colors[0], alpha=tint,
                                       interpolate=True)
            axes[counter].set_ylabel(word, size=11)
        plt.subplots_adjust(hspace=0.1)

    if form == 'subplots_same':
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5 * num_series))
        print('Maximum y axis: %d percent' % (determine_y_limit(max_y)))
        for counter, word in enumerate(dataframe.columns):
            axes[counter].plot(x_values, dataframe[word], color='k')
            axes[counter].set_ylim(0, determine_y_limit(max_y))
            axes[counter].set_xlim(startyear, endyear)
            axes[counter].fill_between(x_values, dataframe[word],
                                       color=colors[1], alpha=1,
                                       interpolate=True)
            axes[counter].set_ylabel(word, size=11)
        plt.subplots_adjust(hspace=0.1)

    if form == 'stream':
        # BUG FIX: originally called bare figure() and pyplot.* which were
        # never imported under those names.
        plt.figure(num=None, figsize=(20, 10), dpi=150, facecolor='w',
                   edgecolor='k')
        plt.title(title, size=17)
        plt.xlim(startyear, endyear)
        # NOTE(review): label text appears truncated in the original source
        # (missing text before the closing paren); preserved as-is.
        yaxtext = 'Percent of words in corpus'
        yaxtext += str(determine_y_limit(max_y)) + ')'
        plt.ylabel(yaxtext, size=13)
        polys = plt.stackplot(x_values,
                              *[dataframe[word] for word in dataframe.columns],
                              colors=colors, baseline=baseline)
        # stackplot gives no legend handles; build proxy rectangles.
        legendProxies = [plt.Rectangle((0, 0), 1, 1,
                                       fc=poly.get_facecolor()[0])
                         for poly in polys]
        plt.legend(legendProxies, list(dataframe.columns), loc=3, ncol=2)
        plt.tick_params(axis='y', which='both',   # major and minor ticks
                        left=False, right=False, labelleft=False)

    plt.show()
    if png_name != '':
        plt.savefig(save_path + "/" + png_name + ".png")
        plt.close()


perdecade = ['redwood', 'bradshaw', 'puffer', 'rollo', 'uv', 'elsie',
             'ter', 'madonna', 'ivan', 'jimmie', 'coolidge', 'roosevelt',
             'planes', 'eisenhower', 'kennedy', 'nixon', 'reagan', 'epa']
print(len(perdecade))
print(perdecade.index('puffer'))

make_chart(df=df, words=perdecade, form='line',
           title='"Trendiest" words in Corpus of Historical American English',
           colors=["#1f78b4", "#ae4ec9", "#33a02c", "#e31a1c",
                   "#009b89", "#b15928"],
           smoothing=0, baseline='sym', png_name='', ymax=0.05)

# Repeat the plot, cycling through the full 12-color palette in chronological
# order. (The original duplicated ~60 lines of make_chart's 'line' branch
# inline; a call with the explicit palette produces the same chart.)
make_chart(df=df, words=perdecade, form='line', title='',
           colors=["#1f78b4", "#ae4ec9", "#33a02c", "#fb9a99", "#e31a1c",
                   "#a6cee3", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a",
                   "#ffff99", "#b15928"],
           smoothing=0, baseline='sym', png_name='', ymax=0.05)

top10 = list(trends.word[:10])
make_chart(df=df, words=top10, form='line',
           title='Top 10 "Trendiest" words in the Corpus of Historical '
                 'American English, 1810s-2000s',
           colors=[], smoothing=0, baseline='sym', png_name='', ymax=0.045)
# note: they are all in the top-10-per-decade list

make_chart(df=df, words=['atomic'], form='line', title='',
           colors=['#ee2222', '#4444aa'], smoothing=0, baseline='sym',
           png_name='', ymax=0.015)