Top words per decade of appearance

In [1]:
import pandas as pd
import time
from math import ceil
import pickle
In [2]:
# load pickle of all words and decades and drop words that appear in more than 15 of the 20 decades

df = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
origlen = len(df)
origwds = len(df.word.unique())
df = df[df.nonalpha == False] # remove words with nonalphanumeric characters
wordcount = pd.DataFrame(df.groupby('word').decade.count())
wordcount = wordcount[wordcount.decade <= 15]
df = df[df.word.isin(wordcount.index)]
df = df[['word', 'decade', 'pct']]
print "{0} records reduced to {1} ({2:0.1f} %)".format(origlen, len(df), len(df)*100.0/origlen)
print "{0} words reduced to {1} ({2:0.1f} %)".format(origwds, len(df.word.unique()), 
                                                          len(df.word.unique())*100.0/origwds)
print df.head(10)
2539728 records reduced to 1214911 (47.8 %)
436103 words reduced to 289826 (66.5 %)
    word  decade       pct
102  aaa    1850  0.000006
103  aaa    1910  0.000009
104  aaa    1920  0.000008
105  aaa    1930  0.001382
106  aaa    1940  0.000170
107  aaa    1950  0.000110
108  aaa    1960  0.000035
109  aaa    1970  0.000052
110  aaa    1980  0.000070
111  aaa    1990  0.000319
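A quick sanity check (not part of the original run): after the filter above, no remaining word should occur in more than 15 of the 20 COHA decades.

assert df.groupby('word').decade.count().max() <= 15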
In [3]:
# keep only words in the crossword dictionary, i.e. exclude proper nouns and other non-dictionary strings

origlen = len(df)
origwds = len(df.word.unique())

import json
xwords = json.loads(open('../data_user_pickle_csv/coha_and_xword.json', 'r').read())
df = df[df.word.isin(xwords)]

print "{0} records reduced to {1} ({2:0.1f} %)".format(origlen, len(df), len(df)*100.0/origlen)
print "{0} words reduced to {1} ({2:0.1f} %)".format(origwds, len(df.word.unique()), 
                                                          len(df.word.unique())*100.0/origwds)
1214911 records reduced to 313863 (25.8 %)
289826 words reduced to 41068 (14.2 %)
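coha_and_xword.json is assumed to be a flat JSON array of words (that is how json.loads plus isin treats it above). An equivalent way to load it that also closes the file handle:

import json
with open('../data_user_pickle_csv/coha_and_xword.json') as f:
    xwords = set(json.load(f))  # isin accepts a list or a set; a set makes the membership intent explicit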
In [4]:
# keep words ranking in the top 10000 by either summed pct or peak (max) pct across decades

origlen = len(df)
origwds = len(df.word.unique())

dfsum = pd.DataFrame(df.groupby('word').pct.sum())
dfsum.sort('pct', ascending=False, inplace=True)
dfsum = dfsum[:10000]
dfmax = pd.DataFrame(df.groupby('word').pct.max())
dfmax.sort('pct', ascending=False, inplace=True)
dfmax = dfmax[:10000]

df = df[(df.word.isin(dfsum.index)) | (df.word.isin(dfmax.index))]

print "{0} records reduced to {1} ({2:0.1f} %)".format(origlen, len(df), len(df)*100.0/origlen)
print "{0} words reduced to {1} ({2:0.1f} %)".format(origwds, len(df.word.unique()), 
                                                          len(df.word.unique())*100.0/origwds)
313863 records reduced to 128688 (41.0 %)
41068 words reduced to 11740 (28.6 %)
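The union of the two top-10,000 lists contains 11,740 words, so 8,260 words must rank in the top 10,000 by both sum and max. A quick check (not part of the run above):

overlap = dfsum.index.intersection(dfmax.index)
print "{0} words are in both top-10000 lists".format(len(overlap))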
In [5]:
# add average pct per decade appeared, decade count, and decade specificity to dfsum

series_count = df.groupby('word').decade.count()

dfsum['pct_per_decade'] = dfsum.pct / series_count[dfsum.index]  # average pct over decades appeared
dfsum['decades'] = series_count[dfsum.index]                     # number of decades the word appears in
dfsum['decade_specificity'] = 20 - dfsum.decades                 # 20 COHA decades (1810s-2000s) minus decades appeared

dfsum.sort('pct_per_decade', ascending=False, inplace=True)
    
In [6]:
print dfsum.head(50)
                    pct  pct_per_decade  decades  decade_specificity
word                                                                
soviet         0.133485        0.011124       12                   8
radio          0.102161        0.007859       13                   7
phone          0.106324        0.007595       14                   6
television     0.071525        0.006502       11                   9
okay           0.063832        0.006383       10                  10
telephone      0.092873        0.006192       15                   5
movie          0.056559        0.005142       11                   9
programs       0.062321        0.004155       15                   5
nuclear        0.047605        0.003967       12                   8
computer       0.045346        0.003779       12                   8
cigarette      0.053111        0.003541       15                   5
airport        0.031493        0.003499        9                  11
automobile     0.038150        0.002935       13                   7
photo          0.043835        0.002922       15                   5
sutta          0.008483        0.002828        3                  17
movies         0.030464        0.002769       11                   9
baseball       0.037568        0.002505       15                   5
shit           0.027538        0.002295       12                   8
weekend        0.024362        0.002215       11                   9
unemployment   0.026051        0.002171       12                   8
concept        0.032165        0.002144       15                   5
aircraft       0.027237        0.002095       13                   7
scheduled      0.027750        0.001982       14                   6
fucking        0.013840        0.001977        7                  13
parking        0.023615        0.001968       12                   8
golf           0.029198        0.001947       15                   5
global         0.019391        0.001939       10                  10
environmental  0.023204        0.001934       12                   8
garage         0.022876        0.001906       12                   8
brittles       0.001743        0.001743        1                  19
soviets        0.016589        0.001659       10                  10
computers      0.014902        0.001656        9                  11
fizgig         0.001648        0.001648        1                  19
almah          0.001642        0.001642        1                  19
buddy          0.019513        0.001626       12                   8
fuck           0.014576        0.001620        9                  11
cloddy         0.009557        0.001593        6                  14
gasoline       0.022212        0.001587       14                   6
output         0.023681        0.001579       15                   5
electronic     0.015746        0.001575       10                  10
racial         0.023483        0.001566       15                   5
airplane       0.017098        0.001554       11                   9
nazi           0.018378        0.001532       12                   8
regional       0.021399        0.001528       14                   6
airlines       0.013531        0.001503        9                  11
skills         0.022295        0.001486       15                   5
basketball     0.017710        0.001476       12                   8
techniques     0.018977        0.001460       13                   7
taxi           0.018859        0.001451       13                   7
video          0.017279        0.001440       12                   8
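Since decade_specificity is 20 minus the number of decades a word appears in, the highest values mark the most era-bound words. A sketch (not part of the run above) restricting the ranked list to words seen in five or fewer decades:

print dfsum[dfsum.decades <= 5].head(10)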
In [8]:
# for contrast: rerun the pipeline without the crossword-dictionary filter and look at the top words that filter excluded (mostly proper nouns)

df_proper = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
df_proper = df_proper[df_proper.nonalpha == False] # remove words with nonalphanumeric characters
wordcount = pd.DataFrame(df_proper.groupby('word').decade.count())
wordcount = wordcount[wordcount.decade <= 15]
df_proper = df_proper[df_proper.word.isin(wordcount.index)]
df_proper = df_proper[['word', 'decade', 'pct']]
df_propersum = pd.DataFrame(df_proper.groupby('word').pct.sum())
df_propersum.sort('pct', ascending=False, inplace=True)
df_propersum = df_propersum[:10000]
df_propermax = pd.DataFrame(df_proper.groupby('word').pct.max())
df_propermax.sort('pct', ascending=False, inplace=True)
df_propermax = df_propermax[:10000]
df_proper = df_proper[(df_proper.word.isin(df_propersum.index)) | (df_proper.word.isin(df_propermax.index))]
proper_series_count = df_proper.groupby('word').decade.count()
df_propersum['pct_per_decade'] = df_propersum.pct / proper_series_count[df_propersum.index]
df_propersum.sort('pct_per_decade', ascending=False, inplace=True)
df_propersum = df_propersum[~df_propersum.index.isin(dfsum.index)]
print df_propersum.head(50)
df_propersum[:50].to_csv('coha_top_omitted_proper_nouns.csv')
                 pct  pct_per_decade
word                                
dorriville  0.033207        0.033207
altorf      0.042033        0.021016
madiboo     0.018765        0.018765
selico      0.018074        0.018074
pacomo      0.016863        0.016863
pufpace     0.016171        0.016171
brazzo      0.015393        0.015393
lescourt    0.013923        0.013923
rossberg    0.027246        0.013623
rheinthal   0.011415        0.011415
plotwell    0.011242        0.011242
fourbin     0.010983        0.010983
immorina    0.010983        0.010983
ridolpho    0.010810        0.010810
bertocci    0.010118        0.010118
demba       0.010118        0.010118
torribal    0.009858        0.009858
devalmore   0.009512        0.009512
erlach      0.037984        0.009496
lesc        0.009426        0.009426
ploughby    0.009253        0.009253
eberard     0.018079        0.009040
makesafe    0.008994        0.008994
ksenia      0.008994        0.008994
joblin      0.017905        0.008952
mentzikoff  0.008648        0.008648
usaldo      0.008561        0.008561
ubal        0.008475        0.008475
almeyda     0.016175        0.008088
hippolito   0.015397        0.007699
barogo      0.007610        0.007610
beraldo     0.015061        0.007531
hardrun     0.007523        0.007523
arandez     0.007351        0.007351
maillac     0.007091        0.007091
mahadi      0.007091        0.007091
bloomville  0.028237        0.007059
spendall    0.007005        0.007005
lanissa     0.006659        0.006659
spicket     0.006287        0.006287
ridol       0.006226        0.006226
shenac      0.006137        0.006137
rainouard   0.006053        0.006053
flaurence   0.005967        0.005967
wildenhain  0.017735        0.005912
cerval      0.011765        0.005883
oresca      0.005794        0.005794
quicksite   0.005448        0.005448
darina      0.005362        0.005362
chetwynde   0.005292        0.005292
In [25]:
# make pivot table showing in which decades words occurred

decades = range(1810, 2010, 10)
dftop = dfsum[:50].copy()  # copy so the added decade columns don't touch dfsum
dftoplookup = df.copy()
for decade in decades:
    dftop[decade] = 0.0
for word in dftop.index:
    occurrences = dftoplookup[dftoplookup.word == word]
    for decade in decades:
        match = occurrences[occurrences.decade == decade]
        if len(match) > 0:
            dftop.loc[word, decade] = match.pct.iloc[0]
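An equivalent, vectorized way to build the same table (a sketch, not what was run above): pivot pct by decade for the top-50 words, fill decades in which a word never appeared with 0, and join the result onto the summary columns.

pivot = (df[df.word.isin(dftop.index)]
         .pivot(index='word', columns='decade', values='pct')
         .reindex(columns=decades)
         .fillna(0))
dftop_alt = dfsum[:50].join(pivot)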
In [26]:
print dftop.head()
                 pct  pct_per_decade  decades  decade_specificity  1810  1820  \
word                                                                            
soviet      0.133485        0.011124       12                   8     0     0   
radio       0.102161        0.007859       13                   7     0     0   
phone       0.106324        0.007595       14                   6     0     0   
television  0.071525        0.006502       11                   9     0     0   
okay        0.063832        0.006383       10                  10     0     0   

            1830  1840  1850  1860  ...       1910      1920      1930  \
word                                ...                                  
soviet         0     0     0     0  ...   0.000208  0.004427  0.006941   
radio          0     0     0     0  ...   0.000253  0.005014  0.012146   
phone          0     0     0     0  ...   0.001941  0.001890  0.005214   
television     0     0     0     0  ...   0.000000  0.000499  0.000588   
okay           0     0     0     0  ...   0.000000  0.000008  0.001353   

                1940      1950      1960      1970      1980      1990  \
word                                                                     
soviet      0.011937  0.030687  0.022103  0.019477  0.027580  0.007039   
radio       0.017642  0.014509  0.012027  0.009458  0.010498  0.010467   
phone       0.006885  0.008053  0.012009  0.013640  0.014318  0.018203   
television  0.001981  0.008791  0.011224  0.014161  0.014006  0.010957   
okay        0.003058  0.004406  0.006313  0.008459  0.009622  0.014044   

                2000  
word                  
soviet      0.003077  
radio       0.010118  
phone       0.023380  
television  0.009295  
okay        0.016564  

[5 rows x 24 columns]
In [27]:
dftop.to_csv('coha_top_decades.csv')