import pandas as pd
import json
import time
from math import ceil
import pickle

# Note: written for Python 2 and an older pandas; DataFrame.sort(column) was
# later replaced by DataFrame.sort_values.

# load pickle of all words and decades and drop words that appear in more than 15 of the 20 decades
df = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
origlen = len(df)
origwds = len(df.word.unique())
df = df[df.nonalpha == False]  # drop words containing non-alphabetic characters
wordcount = pd.DataFrame(df.groupby('word').decade.count())
wordcount = wordcount[wordcount.decade <= 15]
df = df[df.word.isin(wordcount.index)]
df = df[['word', 'decade', 'pct']]
print "{0} records reduced to {1} ({2:0.1f} %)".format(origlen, len(df), len(df) * 100.0 / origlen)
print "{0} words reduced to {1} ({2:0.1f} %)".format(origwds, len(df.word.unique()), len(df.word.unique()) * 100.0 / origwds)
print df.head(10)

# keep only words that appear in the crossword dictionary (this drops proper nouns, among others)
origlen = len(df)
origwds = len(df.word.unique())
xwords = json.loads(open('../data_user_pickle_csv/coha_and_xword.json', 'r').read())
df = df[df.word.isin(xwords)]
print "{0} records reduced to {1} ({2:0.1f} %)".format(origlen, len(df), len(df) * 100.0 / origlen)
print "{0} words reduced to {1} ({2:0.1f} %)".format(origwds, len(df.word.unique()), len(df.word.unique()) * 100.0 / origwds)

# keep the top 10000 words by summed pct plus the top 10000 by max pct
origlen = len(df)
origwds = len(df.word.unique())
dfsum = pd.DataFrame(df.groupby('word').pct.sum())
dfsum.sort('pct', ascending=False, inplace=True)
dfsum = dfsum[:10000]
dfmax = pd.DataFrame(df.groupby('word').pct.max())
dfmax.sort('pct', ascending=False, inplace=True)
dfmax = dfmax[:10000]
df = df[(df.word.isin(dfsum.index)) | (df.word.isin(dfmax.index))]
print "{0} records reduced to {1} ({2:0.1f} %)".format(origlen, len(df), len(df) * 100.0 / origlen)
print "{0} words reduced to {1} ({2:0.1f} %)".format(origwds, len(df.word.unique()), len(df.word.unique()) * 100.0 / origwds)

# add per-decade averages to dfsum; the fewer decades a word spans, the more decade-specific it is
series_count = df.groupby('word').decade.count()
dfsum['pct_per_decade'] = 0.0
dfsum['decades'] = 0
dfsum['decade_specificity'] = 0.0
for i in range(len(dfsum)):
    dfsum.pct_per_decade.iloc[i] = dfsum.pct[i] / series_count[dfsum.index[i]]
    dfsum.decades[i] = series_count[dfsum.index[i]]
    dfsum.decade_specificity[i] = 20 - series_count[dfsum.index[i]]
dfsum.sort('pct_per_decade', ascending=False, inplace=True)
print dfsum.head(50)

# for contrast, build the same ranking without the crossword-dictionary filter
# to see which proper nouns were omitted
df_proper = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
df_proper = df_proper[df_proper.nonalpha == False]  # drop words containing non-alphabetic characters
wordcount = pd.DataFrame(df_proper.groupby('word').decade.count())
wordcount = wordcount[wordcount.decade <= 15]
df_proper = df_proper[df_proper.word.isin(wordcount.index)]
df_proper = df_proper[['word', 'decade', 'pct']]
df_propersum = pd.DataFrame(df_proper.groupby('word').pct.sum())
df_propersum.sort('pct', ascending=False, inplace=True)
df_propersum = df_propersum[:10000]
df_propermax = pd.DataFrame(df_proper.groupby('word').pct.max())
df_propermax.sort('pct', ascending=False, inplace=True)
df_propermax = df_propermax[:10000]
df_proper = df_proper[(df_proper.word.isin(df_propersum.index)) | (df_proper.word.isin(df_propermax.index))]
proper_series_count = df_proper.groupby('word').decade.count()
df_propersum['pct_per_decade'] = 0.0
for i in range(len(df_propersum)):
    df_propersum.pct_per_decade.iloc[i] = df_propersum.pct[i] / proper_series_count[df_propersum.index[i]]
df_propersum.sort('pct_per_decade', ascending=False, inplace=True)
df_propersum = df_propersum[~df_propersum.index.isin(dfsum.index)]
print df_propersum.head(50)
df_propersum[:50].to_csv('coha_top_omitted_proper_nouns.csv')
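# Aside: the per-row loops above (for dfsum and df_propersum) can also be
# written with vectorized pandas operations. A sketch only, reusing dfsum and
# series_count from above; 'counts' and 'dfsum_vectorized' are illustrative
# names, not part of the original script or its output files.
counts = series_count.reindex(dfsum.index)
dfsum_vectorized = pd.DataFrame({'pct': dfsum.pct,
                                 'pct_per_decade': dfsum.pct / counts,
                                 'decades': counts,
                                 'decade_specificity': 20.0 - counts})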
# make pivot table showing in which decades each top word occurred
decades = range(1810, 2010, 10)  # COHA decades: 1810 through 2000
dftop = dfsum[:50].copy()  # copy so the added decade columns do not touch dfsum
dftoplookup = df.copy()
for decade in decades:
    dftop[decade] = 0.0
for i in range(len(dftop)):
    for decade in decades:
        matches = dftoplookup[(dftoplookup.word == dftop.index[i]) & (dftoplookup.decade == decade)]
        if len(matches) > 0:
            dftop[decade].iloc[i] = matches.pct.iloc[0]
print dftop.head()
dftop.to_csv('coha_top_decades.csv')
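# Aside: the nested loop above is essentially a pivot, and pandas can build the
# same word-by-decade grid directly. A sketch only, assuming df and dftop as
# above; 'pivoted' is an illustrative name, not part of the original script.
# It reproduces only the decade columns of dftop, in the same row order.
pivoted = (df[df.word.isin(dftop.index)]
           .pivot(index='word', columns='decade', values='pct')
           .reindex(dftop.index)
           .fillna(0.0))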