## Analysis of "trendiness" by peak height divided by width at 50% height:¶

In [1]:
import pandas as pd
import time
from math import ceil
import pickle
import matplotlib.pyplot as plt
from math import floor, log10
%matplotlib inline

class progress_bar:
def __init__(self, loop_length):
import time
self.start = time.time()
self.increment_size = 100.0/loop_length
self.curr_count = 0
self.curr_pct = 0
self.overflow = False
print '% complete:',

def increment(self):
self.curr_count += self.increment_size
if int(self.curr_count) > self.curr_pct:
self.curr_pct = int(self.curr_count)
if self.curr_pct <= 100:
print self.curr_pct,
elif self.overflow == False:
print "\n*!* Count has gone over 100%; likely either due to:\n*!*   - an error in the loop_length specified when " + \
"progress_bar was instantiated\n*!*   - an error in the placement of the increment() function"
print '*!* Elapsed time when progress bar full: %0.1f seconds.' % (time.time() - self.start)
self.overflow = True

def finish(self):
if self.curr_pct == 99:
print "100", # this is a cheat, because rounding sometimes makes the maximum count 99. One day I'll fix this bug.
if self.overflow == True:
print '*!* Elapsed time after end of loop: %0.1f seconds.\n' % (time.time() - self.start)
else:
print '\nElapsed time: %0.1f seconds.\n' % (time.time() - self.start)

# used to round limit of y axis up to second-most-significant digit
def determine_y_limit(x):
significance = int(floor((log10(x))))
val = floor(x / (10 ** (significance - 1))) + 1
val = val * (10 ** (significance - 1))
return val

In [2]:
df = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
df = df[df.nonalpha == False]
df = df[['word', 'year', 'pct']]
df.sort(['word', 'year'], ascending=True, inplace=True)
words = df.word.unique()
print len(words)

  word  year       pct
0    a  1815  1.763519
1    a  1825  1.901462
2    a  1835  2.061233
3    a  1845  2.073233
4    a  1855  2.069824
337085

In [3]:
# remove any whose count is 20, i.e. never have a zero value
dfcounts = pd.DataFrame(df.groupby('word').pct.count())
wordcounts20 = list(dfcounts[dfcounts.pct == 20].index)
df = df[~df.word.isin(wordcounts20)]

In [4]:
# make a set of top 1000 words for each year, both by max and by total
topwords = set()
for i in range(1815, 2015, 10):
dftemp = df[df.year == i]
dftempmax = dftemp.groupby('word')['pct'].max()
dftempmax.sort()
dftemptotal = dftemp.groupby('word')['pct'].sum()
dftemptotal.sort()
topwords.update(dftempmax[-1000:].index)
topwords.update(dftemptotal[-1000:].index)
print len(topwords)
df = df[df.word.isin(topwords)]

8044

In [5]:
# Add missing years as pct 0

pbar = progress_bar(len(df))

# 1000 words at a time
bin_size = 1000
bins = int(ceil(len(words)/bin_size)) + 1
new_word = []
new_year = []
new_pct = []
for i in range(bins):
loopwords = words[i*bin_size:(i+1)*bin_size]
loopdf = df[df.word.isin(loopwords)]
for j in range(len(loopdf)):
word = loopdf.word.iloc[j]
year = loopdf.year.iloc[j]
pbar.increment()
if j == 0 or word != loopdf.word.iloc[j-1]:
cur_yr = 1815
else:
cur_yr += 10
while cur_yr < year:
new_word.append(word)
new_year.append(cur_yr)
new_pct.append(0)
cur_yr += 10
if j == len(loopdf) - 1 or word != loopdf.word.iloc[j+1]:
while cur_yr <= 2005 and cur_yr != year:
new_word.append(word)
new_year.append(cur_yr)
new_pct.append(0)
cur_yr += 10
pbar.finish()

% complete: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
Elapsed time: 17.7 seconds.


In [6]:
print len(new_word)
print len(df)
df = df.append(pd.DataFrame({'word':new_word, 'year':new_year, 'pct':new_pct}), ignore_index = True)
df.sort(['word', 'year'], ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)
df.to_pickle('coha_1_trendiness_checkpoint.pickle')

38407
106758

In [7]:
# sanity check
print df_orig[df_orig.word=="dukakis"]
print df[df.word=="dukakis"]

           word  freq  decade nonalpha  length       pct
650008  dukakis    10    1970    False       7  0.000043
650009  dukakis   570    1980    False       7  0.002344
650010  dukakis    70    1990    False       7  0.000260
650011  dukakis    28    2000    False       7  0.000098
pct     word  year
40608  0.000000  dukakis  1815
40609  0.000000  dukakis  1825
40610  0.000000  dukakis  1835
40611  0.000000  dukakis  1845
40612  0.000000  dukakis  1855
40613  0.000000  dukakis  1865
40614  0.000000  dukakis  1875
40615  0.000000  dukakis  1885
40616  0.000000  dukakis  1895
40617  0.000000  dukakis  1905
40618  0.000000  dukakis  1915
40619  0.000000  dukakis  1925
40620  0.000000  dukakis  1935
40621  0.000000  dukakis  1945
40622  0.000000  dukakis  1955
40623  0.000000  dukakis  1965
40624  0.000043  dukakis  1975
40625  0.002344  dukakis  1985
40626  0.000260  dukakis  1995
40627  0.000098  dukakis  2005

In [8]:
# add interpolated values for years ending in 0
# so that peaks can be calculated for single-decade words

pbar = progress_bar(len(df))

# 10,000 rows at a time
bin_size = 10000
bins = int(ceil(len(df)/bin_size)) + 1
new_word = []
new_year = []
new_pct = []
for i in range(bins):
loopdf = df[i*bin_size:(i+1)*bin_size]
for j in range(len(loopdf)):
word = loopdf.word.iloc[j]
year = loopdf.year.iloc[j]
pbar.increment()
if j == 0 or word != loopdf.word.iloc[j-1]:
pass
else:
new_word.append(word)
new_year.append(year - 5)
avgpct = loopdf.pct.iloc[j]
avgpct += loopdf.pct.iloc[j-1]
avgpct /= 2
new_pct.append(avgpct)
pbar.finish()

% complete: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
Elapsed time: 24.9 seconds.


In [9]:
print len(new_word)
print len(df)
df = df.append(pd.DataFrame({'word':new_word, 'year':new_year, 'pct':new_pct}), ignore_index = True)
df.sort(['word', 'year'], ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)
df.to_pickle('coha_1_trendiness.pickle')

137108
145165

In [10]:
# sanity check dforig = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
print df_orig[df_orig.word == 'dukakis']
print df[df.word == 'dukakis']

           word  freq  decade nonalpha  length       pct
650008  dukakis    10    1970    False       7  0.000043
650009  dukakis   570    1980    False       7  0.002344
650010  dukakis    70    1990    False       7  0.000260
650011  dukakis    28    2000    False       7  0.000098
pct     word  year
78975  0.000000  dukakis  1815
78976  0.000000  dukakis  1820
78977  0.000000  dukakis  1825
78978  0.000000  dukakis  1830
78979  0.000000  dukakis  1835
78980  0.000000  dukakis  1840
78981  0.000000  dukakis  1845
78982  0.000000  dukakis  1850
78983  0.000000  dukakis  1855
78984  0.000000  dukakis  1860
78985  0.000000  dukakis  1865
78986  0.000000  dukakis  1870
78987  0.000000  dukakis  1875
78988  0.000000  dukakis  1880
78989  0.000000  dukakis  1885
78990  0.000000  dukakis  1890
78991  0.000000  dukakis  1895
78992  0.000000  dukakis  1900
78993  0.000000  dukakis  1905
78994  0.000000  dukakis  1910
78995  0.000000  dukakis  1915
78996  0.000000  dukakis  1920
78997  0.000000  dukakis  1925
78998  0.000000  dukakis  1930
78999  0.000000  dukakis  1935
79000  0.000000  dukakis  1940
79001  0.000000  dukakis  1945
79002  0.000000  dukakis  1950
79003  0.000000  dukakis  1955
79004  0.000000  dukakis  1960
79005  0.000000  dukakis  1965
79006  0.000022  dukakis  1970
79007  0.000043  dukakis  1975
79008  0.001194  dukakis  1980
79009  0.002344  dukakis  1985
79010  0.001302  dukakis  1990
79011  0.000260  dukakis  1995
79012  0.000179  dukakis  2000
79013  0.000098  dukakis  2005


## Calculate peaks¶

In [11]:
wordmax = df.groupby('word').pct.max()
peak_height_cutoff = 0.5

words = []
years_start = []
years_max = []
years_end = []
trendiness = []

pbar = progress_bar(len(df))
for i in range(len(df)):
pbar.increment()
year = df.year.iloc[i]
pct = df.pct.iloc[i]
if year == 1815:
word = df.word.iloc[i]
cur_max = wordmax[word]
year_start = 0
year_max = 0
year_end = 0
if pct < peak_height_cutoff * cur_max:
starts_below_cutoff = True
else:
starts_below_cutoff = False
if pct >= peak_height_cutoff * cur_max:
if year_start == 0 and dips_below_cutoff == True:
year_start = year
year_end = year
else:
dips_below_cutoff = True
if pct == cur_max:
year_max = year
if (year == 2005 and
starts_below_cutoff == True and
pct < peak_height_cutoff * cur_max): # equivalent of ends_below_cutoff
words.append(word)
years_start.append(year_start)
years_max.append(year_max)
years_end.append(year_end)
trendiness.append(cur_max / (year_end - year_start))

pbar.finish()

% complete: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
Elapsed time: 20.8 seconds.


In [12]:
trends = pd.DataFrame({'word':words, 'year_start':years_start, 'year_max':years_max, 'year_end':years_end,
'trendiness':trendiness})
trends = trends[['word', 'trendiness', 'year_start', 'year_max', 'year_end']]
trends.sort('trendiness', ascending=False, inplace=True)
trends.to_csv('coha_trendiness.csv')
trends.to_pickle('coha_trendiness.pickle')
print len(trends)

3941
word  trendiness  year_start  year_max  year_end
2955      reagan    0.003379        1980      1985      1990
2572       nixon    0.002831        1970      1975      1980
3687          uv    0.002767        1860      1865      1870
1931     kennedy    0.002747        1960      1965      1970
1099  eisenhower    0.002241        1950      1955      1960
3525         ter    0.001686        1880      1885      1890
734    communist    0.001660        1950      1955      1965
2793      planes    0.001219        1940      1945      1950
1870      jimmie    0.001180        1910      1915      1920
770     coolidge    0.001105        1920      1925      1930
1128       elsie    0.001072        1870      1875      1880
438     bradshaw    0.001070        1830      1835      1840
1965       korea    0.001056        1950      1955      1960
3064       rollo    0.001039        1850      1855      1860
3736     vietnam    0.001029        1960      1965      1975
3071   roosevelt    0.001027        1930      1935      1950
1917        katy    0.000983        1860      1865      1870
1458      graeme    0.000935        1860      1865      1870
1102     eleanor    0.000932        1920      1925      1930
3871    winthrop    0.000931        1850      1855      1860
1843        jeff    0.000906        1950      1955      1960
2183   madeleine    0.000891        1860      1865      1870
898         dave    0.000881        1910      1915      1920
735   communists    0.000877        1950      1955      1965
2006       lanny    0.000862        1940      1945      1950
1058      dulles    0.000836        1950      1955      1960
2686          pa    0.000820        1880      1885      1890
107          amy    0.000808        1860      1865      1870
1869       jimbo    0.000797        1970      1975      1980
1800    isabella    0.000783        1830      1835      1840
1952   kissinger    0.000781        1970      1975      1980
3340      soviet    0.000767        1950      1955      1990
2971     redwood    0.000761        1820      1825      1830
964        dewey    0.000757        1940      1945      1950
3407      stitch    0.000749        1870      1875      1880
1528       gypsy    0.000737        1860      1865      1870
1639         hev    0.000733        1860      1865      1870
1661      hitler    0.000718        1935      1945      1950
1131      elvira    0.000714        1820      1825      1830
2319         mcs    0.000714        1950      1955      1960
201       atomic    0.000709        1945      1955      1960
841         cuba    0.000687        1960      1965      1970
76    alessandro    0.000685        1880      1885      1890
3846     wilford    0.000683        1860      1865      1870
3632      truman    0.000681        1945      1955      1960
2211      malone    0.000672        1960      1965      1970
2193    magdalen    0.000665        1870      1875      1880
1966      korean    0.000663        1950      1955      1960
3092     rowland    0.000662        1870      1875      1880
3403   stevenson    0.000652        1950      1955      1960

In [13]:
print df[df.word == 'reagan']

             pct    word  year
211087  0.000000  reagan  1815
211088  0.000000  reagan  1820
211089  0.000000  reagan  1825
211090  0.000000  reagan  1830
211091  0.000000  reagan  1835
211092  0.000000  reagan  1840
211093  0.000000  reagan  1845
211094  0.000000  reagan  1850
211095  0.000000  reagan  1855
211096  0.000003  reagan  1860
211097  0.000006  reagan  1865
211098  0.000025  reagan  1870
211099  0.000044  reagan  1875
211100  0.000065  reagan  1880
211101  0.000086  reagan  1885
211102  0.000083  reagan  1890
211103  0.000079  reagan  1895
211104  0.000049  reagan  1900
211105  0.000019  reagan  1905
211106  0.000014  reagan  1910
211107  0.000009  reagan  1915
211108  0.000007  reagan  1920
211109  0.000004  reagan  1925
211110  0.000006  reagan  1930
211111  0.000008  reagan  1935
211112  0.000023  reagan  1940
211113  0.000038  reagan  1945
211114  0.000038  reagan  1950
211115  0.000038  reagan  1955
211116  0.001042  reagan  1960
211117  0.002045  reagan  1965
211118  0.002479  reagan  1970
211119  0.002912  reagan  1975
211120  0.018351  reagan  1980
211121  0.033789  reagan  1985
211122  0.018466  reagan  1990
211123  0.003143  reagan  1995
211124  0.002649  reagan  2000
211125  0.002156  reagan  2005

In [14]:
# top for each decade
for i in range(1825,2005,10):
print i, trends[trends.year_max == i].word.iloc[0]

1825 redwood
1845 puffer
1855 rollo
1865 uv
1875 elsie
1885 ter
1905 ivan
1915 jimmie
1925 coolidge
1935 roosevelt
1945 planes
1955 eisenhower
1965 kennedy
1975 nixon
1985 reagan
1995 epa

In [15]:
# used to round limit of y axis up to second-most-significant digit
def determine_y_limit(x):
significance = int(floor((log10(x))))
val = floor(x / (10 ** (significance - 1))) + 1
val = val * (10 ** (significance - 1))
return val

def make_chart(df, words, form = 'line', title='', colors= [], smoothing=0,
baseline='sym', png_name='', ymax=None):

dataframe = df[df['word'].isin(words)]
dataframe = pd.DataFrame(pd.pivot_table(dataframe, values='pct', index = 'year', columns=['word']))
dataframe.sort(inplace=True, ascending=True)

startyear = min(list(dataframe.index))
endyear = max(list(dataframe.index))
yearstr = '%d-%d' % (startyear, endyear)

legend_size = 0.01

max_y = 0
for word in dataframe.columns:
max_y = max(max_y, dataframe[word].max())
final_word = word
if smoothing > 0:
newvalues = []
for row in range(len(dataframe)):
start = max(0, row - smoothing)
end = min(len(dataframe) - 1, row + smoothing)
newvalues.append(dataframe[word].iloc[start:end].mean())
for row in range(len(dataframe)):
dataframe[word].iloc[row] = newvalues[row]

y_text = "% of words in corpus"

num_series = len(dataframe.columns)

if colors == []:
colors = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3",
"#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"]
num_colors = len(colors)

if num_series > num_colors:
print "Warning: colors will be repeated."

x_values = list(dataframe.index)
y_zeroes = [0] * len(x_values)

if form == 'line':
fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
counter = 0
for word in words:
color = colors[counter % num_colors]
counter += 1
label = word
ax.plot(x_values, dataframe[word], label=label, color=color, linewidth = 3)
if ymax == None:
ax.set_ylim(0,determine_y_limit(max_y))
else:
ax.set_ylim(0, ymax)
ax.set_title(title, size = 20)
ax.set_xlim(startyear, endyear)
ax.set_ylabel(y_text, size = 20)
ax.set_xticks(range(1810, 2010, 10))
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * legend_size,
box.width, box.height * (1 - legend_size)])
legend_cols = min(5, num_series)
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols, fontsize=16)

if form == 'subplots_auto':
counter = 0
fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series))
print 'Maximum alpha: %d percent' % (determine_y_limit(max_y))
for word in dataframe.columns:
label = word
current_ymax = dataframe[word].max()
tint = 1.0 * current_ymax / determine_y_limit(max_y)
axes[counter].plot(x_values, dataframe[word], color='k')
axes[counter].set_ylim(0,determine_y_limit(current_ymax))
axes[counter].set_xlim(startyear, endyear)
axes[counter].fill_between(x_values, dataframe[word], color=colors[0], alpha=tint, interpolate=True)

axes[counter].set_ylabel(label, size=11)
counter += 1

if form == 'subplots_same':
counter = 0
fig, axes = plt.subplots(num_series, 1, figsize=(12, 3.5*num_series))
print 'Maximum y axis: %d percent' % (determine_y_limit(max_y))
for word in dataframe.columns:
label = word
axes[counter].plot(x_values, dataframe[word], color='k')
axes[counter].set_ylim(0,determine_y_limit(max_y))
axes[counter].set_xlim(startyear, endyear)
axes[counter].fill_between(x_values, dataframe[word], color=colors[1], alpha=1, interpolate=True)
axes[counter].set_ylabel(label, size=11)
counter += 1

if form == 'stream':
figure(num=None, figsize=(20,10), dpi=150, facecolor='w', edgecolor='k')
plt.title(title, size=17)
plt.xlim(startyear, endyear)

yaxtext = 'Percent of words in corpus'

scale = str(determine_y_limit(max_y)) + ')'
yaxtext += scale
plt.ylabel(yaxtext, size=13)
polys = pyplot.stackplot(x_values, *[dataframe[word] for word in dataframe.columns],
colors=colors, baseline=baseline)
legendProxies = []
for poly in polys:
legendProxies.append(pyplot.Rectangle((0, 0), 1, 1, fc=poly.get_facecolor()[0]))
wordlist = []
for word in dataframe.columns:
wordlist.append(word)
plt.legend(legendProxies, wordlist, loc=3, ncol=2)

plt.tick_params(\
axis='y',
which='both',      #  major and minor ticks
left='off',
right='off',
labelleft='off')

plt.show()
if png_name != '':
fileword = save_path + "/" + png_name + ".png"
plt.savefig(fileword)
plt.close()

In [16]:
perdecade = ['redwood', 'bradshaw', 'puffer', 'rollo', 'uv', 'elsie', 'ter', 'madonna', 'ivan', 'jimmie', 'coolidge', 'roosevelt', 'planes', 'eisenhower', 'kennedy', 'nixon', 'reagan', 'epa']


18
2

In [17]:
make_chart(df=df,
form = 'line',
title='\"Trendiest\" words in Corpus of Historical American English',
colors = ["#1f78b4","#ae4ec9","#33a02c","#e31a1c",
"#009b89","#b15928"],
smoothing=0,
baseline='sym',
png_name='',
ymax = 0.05)

Warning: colors will be repeated.

In [19]:
#repeat plot but with repeating six colors in chronological order

title=''
smoothing=0
ymax=0.05

dataframe = df[df['word'].isin(words)]
dataframe = pd.DataFrame(pd.pivot_table(dataframe, values='pct', index = 'year', columns=['word']))
dataframe.sort(inplace=True, ascending=True)

startyear = min(list(dataframe.index))
endyear = max(list(dataframe.index))
yearstr = '%d-%d' % (startyear, endyear)

legend_size = 0.01

max_y = 0
for word in dataframe.columns:
max_y = max(max_y, dataframe[word].max())
final_word = word
if smoothing > 0:
newvalues = []
for row in range(len(dataframe)):
start = max(0, row - smoothing)
end = min(len(dataframe) - 1, row + smoothing)
newvalues.append(dataframe[word].iloc[start:end].mean())
for row in range(len(dataframe)):
dataframe[word].iloc[row] = newvalues[row]

y_text = "% of words in corpus"

num_series = len(dataframe.columns)

colors = ["#1f78b4","#ae4ec9","#33a02c","#fb9a99","#e31a1c","#a6cee3",
"#fdbf6f","#ff7f00","#cab2d6","#6a3d9a","#ffff99","#b15928"]

num_colors = len(colors)

if num_series > num_colors:
print "Warning: colors will be repeated."

x_values = list(dataframe.index)
y_zeroes = [0] * len(x_values)

fig, ax = plt.subplots(num=None, figsize=(16, 9), dpi=300, facecolor='w', edgecolor='w')
counter = 0
for word in words:
color = colors[counter % num_colors]
counter += 1
label = word
ax.plot(x_values, dataframe[word], label=label, color=color, linewidth = 3)
if ymax == None:
ax.set_ylim(0,determine_y_limit(max_y))
else:
ax.set_ylim(0, ymax)
ax.set_title(title, size = 20)
ax.set_xlim(startyear, endyear)
ax.set_ylabel(y_text, size = 20)
ax.set_xticks(range(1810, 2010, 10))
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * legend_size,
box.width, box.height * (1 - legend_size)])
legend_cols = min(5, num_series)
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=legend_cols, fontsize=16)

Warning: colors will be repeated.

Out[19]:
<matplotlib.legend.Legend at 0xcb06358>
In [20]:
top10 = list(trends.word[:10])
make_chart(df=df,
words = top10,
form = 'line',
title='Top 10 \"Trendiest\" words in the Corpus of Historical American English, 1810s-2000s',
colors= [],
smoothing=0,
baseline='sym',
png_name='',
ymax = 0.045)

# note: they are all the in the top 10 per decade list

In [23]:
make_chart(df=df,
words = ['atomic'],
form = 'line',
title='',
colors= ['#ee2222', '#4444aa'],
smoothing=0,
baseline='sym',
png_name='',
ymax = 0.015)