"""Compute word "trendiness" from COHA decade-level frequency data and chart it.

Pipeline
--------
1. Load per-decade word frequencies; drop words present in all 20 decades
   (they can never spike from zero); keep the union of the top 1000 words
   per decade by max pct and by total pct.
2. Fill missing decades with pct == 0 so every word has a complete series.
3. Interpolate mid-decade points so even a single-decade spike has a
   measurable peak width.
4. For each word, find the span where pct stays >= 50% of its maximum;
   trendiness = peak height / peak width. Only words that both start and
   end below the cutoff (a genuine rise-and-fall) are recorded.
5. Chart the results with matplotlib.

NOTE: originally a Python 2 Jupyter notebook; modernized to Python 3 and
current pandas (sort_values / concat replace the removed sort / append).
"""

import pickle  # noqa: F401  (kept from original; pd.read_pickle is used instead)
import time
from math import ceil, floor, log10

import pandas as pd
import matplotlib.pyplot as plt

# %matplotlib inline  -- notebook magic; re-enable when running in Jupyter


class progress_bar:
    """Console progress indicator for a loop of known length.

    Call increment() once per iteration and finish() after the loop.
    """

    def __init__(self, loop_length):
        self.start = time.time()
        self.increment_size = 100.0 / loop_length
        self.curr_count = 0
        self.curr_pct = 0
        self.overflow = False
        print('% complete:', end=' ')

    def increment(self):
        """Advance the bar; prints each new whole percent reached."""
        self.curr_count += self.increment_size
        if int(self.curr_count) > self.curr_pct:
            self.curr_pct = int(self.curr_count)
            if self.curr_pct <= 100:
                print(self.curr_pct, end=' ')
            elif not self.overflow:
                # Going past 100% means loop_length or increment() placement is wrong.
                print("\n*!* Count has gone over 100%; likely either due to:\n"
                      "*!* - an error in the loop_length specified when "
                      "progress_bar was instantiated\n"
                      "*!* - an error in the placement of the increment() function")
                print('*!* Elapsed time when progress bar full: %0.1f seconds.'
                      % (time.time() - self.start))
                self.overflow = True

    def finish(self):
        """Print total elapsed time for the loop."""
        if self.curr_pct == 99:
            # Rounding can leave the counter at 99; display 100 anyway.
            print("100", end=' ')
        if self.overflow:
            print('*!* Elapsed time after end of loop: %0.1f seconds.\n'
                  % (time.time() - self.start))
        else:
            print('\nElapsed time: %0.1f seconds.\n' % (time.time() - self.start))


def determine_y_limit(x):
    """Round x up at its second-most-significant digit.

    Used to pick a tidy upper limit for chart y axes, e.g. 0.0437 -> 0.044.
    """
    significance = int(floor(log10(x)))
    val = floor(x / (10 ** (significance - 1))) + 1
    return val * (10 ** (significance - 1))


# ---------------------------------------------------------------- load data
df = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
df = df[df.nonalpha == False]          # noqa: E712  (boolean column filter)
df['year'] = df.decade + 5             # use the middle of each decade
df = df[['word', 'year', 'pct']]
df.sort_values(['word', 'year'], inplace=True)
print(df.head())

words = df.word.unique()
print(len(words))

# Remove any word whose count is 20, i.e. present in all 20 decades and so
# never has a zero value -- such words cannot produce a rise-and-fall peak.
dfcounts = pd.DataFrame(df.groupby('word').pct.count())
wordcounts20 = list(dfcounts[dfcounts.pct == 20].index)
df = df[~df.word.isin(wordcounts20)]

# Keep the union of the top 1000 words for each decade, both by max and by
# total pct within the decade.
topwords = set()
for yr in range(1815, 2015, 10):
    dftemp = df[df.year == yr]
    topwords.update(dftemp.groupby('word')['pct'].max().nlargest(1000).index)
    topwords.update(dftemp.groupby('word')['pct'].sum().nlargest(1000).index)
print(len(topwords))
df = df[df.word.isin(topwords)]

# ------------------------------------------ add missing decades as pct == 0
pbar = progress_bar(len(df))
bin_size = 1000                        # process 1000 words at a time
# Proper ceiling division (the old int(ceil(a/b)) + 1 relied on Python 2
# integer division and always produced one extra empty bin).
bins = int(ceil(len(words) / bin_size))
new_word = []
new_year = []
new_pct = []
for i in range(bins):
    loopwords = words[i * bin_size:(i + 1) * bin_size]
    loopdf = df[df.word.isin(loopwords)]
    for j in range(len(loopdf)):
        word = loopdf.word.iloc[j]
        year = loopdf.year.iloc[j]
        pbar.increment()
        if j == 0 or word != loopdf.word.iloc[j - 1]:
            cur_yr = 1815              # first row for this word
        else:
            cur_yr += 10
        # Fill any gap of decades before this row's year.
        while cur_yr < year:
            new_word.append(word)
            new_year.append(cur_yr)
            new_pct.append(0)
            cur_yr += 10
        # On the word's last row, pad trailing decades out to 2005.
        # BUG FIX: the original tested `cur_yr != year` here, but cur_yr
        # always equals year at this point, so trailing zeros were never added.
        if j == len(loopdf) - 1 or word != loopdf.word.iloc[j + 1]:
            cur_yr += 10
            while cur_yr <= 2005:
                new_word.append(word)
                new_year.append(cur_yr)
                new_pct.append(0)
                cur_yr += 10
pbar.finish()
print(len(new_word))
print(len(df))
df = pd.concat([df, pd.DataFrame({'word': new_word, 'year': new_year,
                                  'pct': new_pct})], ignore_index=True)
df.sort_values(['word', 'year'], inplace=True)
df.reset_index(drop=True, inplace=True)
df.to_pickle('coha_1_trendiness_checkpoint.pickle')

# Sanity check: a late, single-spike word should now have zero-filled decades.
df_orig = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
print(df_orig[df_orig.word == "dukakis"])
print(df[df.word == "dukakis"])

# ------------------- interpolate mid-decade points (years ending in 0)
# so that peaks can be measured even for single-decade words.
pbar = progress_bar(len(df))
new_word = []
new_year = []
new_pct = []
# Iterate the whole frame once instead of 10,000-row bins: the original
# binning treated the first row of every bin as a word start, silently
# skipping interpolation across bin boundaries.
words_arr = df.word.values
years_arr = df.year.values
pcts_arr = df.pct.values
for j in range(len(df)):
    pbar.increment()
    if j > 0 and words_arr[j] == words_arr[j - 1]:
        new_word.append(words_arr[j])
        new_year.append(years_arr[j] - 5)
        # midpoint between this decade's value and the previous one
        new_pct.append((pcts_arr[j] + pcts_arr[j - 1]) / 2.0)
pbar.finish()
print(len(new_word))
print(len(df))
df = pd.concat([df, pd.DataFrame({'word': new_word, 'year': new_year,
                                  'pct': new_pct})], ignore_index=True)
df.sort_values(['word', 'year'], inplace=True)
df.reset_index(drop=True, inplace=True)
df.to_pickle('coha_1_trendiness.pickle')

# Sanity check again after interpolation.
df = pd.read_pickle("coha_1_trendiness.pickle")
df_orig = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
print(df_orig[df_orig.word == 'dukakis'])
print(df[df.word == 'dukakis'])

# --------------------------------------------------- compute trendiness
wordmax = df.groupby('word').pct.max()
peak_height_cutoff = 0.5               # peak = span at >= 50% of the word's max

words = []
years_start = []
years_max = []
years_end = []
trendiness = []
pbar = progress_bar(len(df))
for i in range(len(df)):
    pbar.increment()
    year = df.year.iloc[i]
    pct = df.pct.iloc[i]
    if year == 1815:
        # First row of a new word (guaranteed by the zero-filling above):
        # reset all per-word state.
        word = df.word.iloc[i]
        cur_max = wordmax[word]
        year_start = 0
        year_max = 0
        year_end = 0
        starts_below_cutoff = pct < peak_height_cutoff * cur_max
        # BUG FIX: was carried over from the previous word (and unbound for
        # the very first word), leaking dip state across words.
        dips_below_cutoff = False
    if pct >= peak_height_cutoff * cur_max:
        # Peak starts at the first above-cutoff year after a dip below.
        if year_start == 0 and dips_below_cutoff:
            year_start = year
        year_end = year
    else:
        dips_below_cutoff = True
    if pct == cur_max:
        year_max = year
    if (year == 2005 and starts_below_cutoff
            and pct < peak_height_cutoff * cur_max):  # i.e. ends below cutoff
        # Guard against a zero-width peak (shouldn't occur once mid-decade
        # interpolation is in place, since the midpoints sit exactly at the
        # cutoff, but avoids a ZeroDivisionError on degenerate data).
        if year_end > year_start:
            words.append(word)
            years_start.append(year_start)
            years_max.append(year_max)
            years_end.append(year_end)
            trendiness.append(cur_max / (year_end - year_start))
pbar.finish()

trends = pd.DataFrame({'word': words, 'year_start': years_start,
                       'year_max': years_max, 'year_end': years_end,
                       'trendiness': trendiness})
trends = trends[['word', 'trendiness', 'year_start', 'year_max', 'year_end']]
trends.sort_values('trendiness', ascending=False, inplace=True)
trends.to_csv('coha_trendiness.csv')
trends.to_pickle('coha_trendiness.pickle')
print(len(trends))
print(trends.head(50))
print(df[df.word == 'reagan'])

# Top trendy word for each decade (guard: a decade may have no entries).
for i in range(1825, 2005, 10):
    decade_words = trends[trends.year_max == i].word
    if len(decade_words):
        print(i, decade_words.iloc[0])


# ------------------------------------------------------------- charting
def make_chart(df, words, form='line', title='', colors=None, smoothing=0,
               baseline='sym', png_name='', ymax=None, save_path='.'):
    """Chart pct-over-time for the given words.

    Parameters
    ----------
    df : DataFrame with 'word', 'year', 'pct' columns.
    words : list of words to plot.
    form : 'line' (one axis), 'subplots_auto' (each subplot scaled to its own
        max, shading tinted by relative height), 'subplots_same' (shared y
        scale), or 'stream' (stacked stream graph).
    colors : list of hex colors; None/[] uses the default 12-color palette.
    smoothing : half-width (in rows) of a moving-average window; 0 = off.
    baseline : passed to stackplot for 'stream' form.
    png_name : if non-empty, save the figure to save_path/png_name.png.
        (save_path is a new keyword with a backward-compatible default;
        the original referenced an undefined name.)
    ymax : fixed y-axis limit for 'line'; None = auto via determine_y_limit.
    """
    dataframe = pd.pivot_table(df[df['word'].isin(words)], values='pct',
                               index='year', columns=['word'])
    dataframe.sort_index(inplace=True)
    startyear = min(dataframe.index)
    endyear = max(dataframe.index)
    legend_size = 0.01

    max_y = 0
    for word in dataframe.columns:
        max_y = max(max_y, dataframe[word].max())
        if smoothing > 0:
            # Centered moving average; the right edge of the slice is
            # exclusive, matching the original window exactly.
            newvalues = []
            for row in range(len(dataframe)):
                start = max(0, row - smoothing)
                end = min(len(dataframe) - 1, row + smoothing)
                newvalues.append(dataframe[word].iloc[start:end].mean())
            dataframe[word] = newvalues

    y_text = "% of words in corpus"
    num_series = len(dataframe.columns)
    if not colors:
        colors = ["#1f78b4", "#ae4ec9", "#33a02c", "#fb9a99", "#e31a1c",
                  "#a6cee3", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a",
                  "#ffff99", "#b15928"]
    num_colors = len(colors)
    if num_series > num_colors:
        print("Warning: colors will be repeated.")
    x_values = list(dataframe.index)

    if form == 'line':
        fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300,
                               facecolor='w', edgecolor='w')
        for counter, word in enumerate(words):
            ax.plot(x_values, dataframe[word], label=word,
                    color=colors[counter % num_colors], linewidth=3)
        if ymax is None:
            ax.set_ylim(0, determine_y_limit(max_y))
        else:
            ax.set_ylim(0, ymax)
        ax.set_title(title, size=20)
        ax.set_xlim(startyear, endyear)
        ax.set_ylabel(y_text, size=20)
        ax.set_xticks(range(1810, 2010, 10))
        # Shrink the axis slightly to make room for the legend below it.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * legend_size,
                         box.width, box.height * (1 - legend_size)])
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05),
                  fancybox=True, shadow=True,
                  ncol=min(5, num_series), fontsize=16)

    if form == 'subplots_auto':
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5 * num_series))
        print('Maximum alpha: %d percent' % (determine_y_limit(max_y)))
        for counter, word in enumerate(dataframe.columns):
            current_ymax = dataframe[word].max()
            # Tint the fill in proportion to this word's share of the overall max.
            tint = 1.0 * current_ymax / determine_y_limit(max_y)
            axes[counter].plot(x_values, dataframe[word], color='k')
            axes[counter].set_ylim(0, determine_y_limit(current_ymax))
            axes[counter].set_xlim(startyear, endyear)
            axes[counter].fill_between(x_values, dataframe[word],
                                       color=colors[0], alpha=tint,
                                       interpolate=True)
            axes[counter].set_ylabel(word, size=11)
        plt.subplots_adjust(hspace=0.1)

    if form == 'subplots_same':
        fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5 * num_series))
        print('Maximum y axis: %d percent' % (determine_y_limit(max_y)))
        for counter, word in enumerate(dataframe.columns):
            axes[counter].plot(x_values, dataframe[word], color='k')
            axes[counter].set_ylim(0, determine_y_limit(max_y))
            axes[counter].set_xlim(startyear, endyear)
            axes[counter].fill_between(x_values, dataframe[word],
                                       color=colors[1], alpha=1,
                                       interpolate=True)
            axes[counter].set_ylabel(word, size=11)
        plt.subplots_adjust(hspace=0.1)

    if form == 'stream':
        # BUG FIX: originally called bare figure() and pyplot.* which were
        # never imported under those names.
        plt.figure(num=None, figsize=(20, 10), dpi=150, facecolor='w',
                   edgecolor='k')
        plt.title(title, size=17)
        plt.xlim(startyear, endyear)
        # NOTE(review): label text appears truncated in the original source
        # (missing text before the closing paren); preserved as-is.
        yaxtext = 'Percent of words in corpus'
        yaxtext += str(determine_y_limit(max_y)) + ')'
        plt.ylabel(yaxtext, size=13)
        polys = plt.stackplot(x_values,
                              *[dataframe[word] for word in dataframe.columns],
                              colors=colors, baseline=baseline)
        # stackplot gives no legend handles; build proxy rectangles.
        legendProxies = [plt.Rectangle((0, 0), 1, 1,
                                       fc=poly.get_facecolor()[0])
                         for poly in polys]
        plt.legend(legendProxies, list(dataframe.columns), loc=3, ncol=2)
        plt.tick_params(axis='y', which='both',   # major and minor ticks
                        left=False, right=False, labelleft=False)

    plt.show()
    if png_name != '':
        plt.savefig(save_path + "/" + png_name + ".png")
        plt.close()


perdecade = ['redwood', 'bradshaw', 'puffer', 'rollo', 'uv', 'elsie',
             'ter', 'madonna', 'ivan', 'jimmie', 'coolidge', 'roosevelt',
             'planes', 'eisenhower', 'kennedy', 'nixon', 'reagan', 'epa']
print(len(perdecade))
print(perdecade.index('puffer'))

make_chart(df=df, words=perdecade, form='line',
           title='"Trendiest" words in Corpus of Historical American English',
           colors=["#1f78b4", "#ae4ec9", "#33a02c", "#e31a1c",
                   "#009b89", "#b15928"],
           smoothing=0, baseline='sym', png_name='', ymax=0.05)

# Repeat the plot, cycling through the full 12-color palette in chronological
# order. (The original duplicated ~60 lines of make_chart's 'line' branch
# inline; a call with the explicit palette produces the same chart.)
make_chart(df=df, words=perdecade, form='line', title='',
           colors=["#1f78b4", "#ae4ec9", "#33a02c", "#fb9a99", "#e31a1c",
                   "#a6cee3", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a",
                   "#ffff99", "#b15928"],
           smoothing=0, baseline='sym', png_name='', ymax=0.05)

top10 = list(trends.word[:10])
make_chart(df=df, words=top10, form='line',
           title='Top 10 "Trendiest" words in the Corpus of Historical '
                 'American English, 1810s-2000s',
           colors=[], smoothing=0, baseline='sym', png_name='', ymax=0.045)
# note: they are all in the top-10-per-decade list

make_chart(df=df, words=['atomic'], form='line', title='',
           colors=['#ee2222', '#4444aa'], smoothing=0, baseline='sym',
           png_name='', ymax=0.015)