import pandas as pd
import json
import time
from math import ceil
import pickle

# Note: written for Python 2 and an older pandas; DataFrame.sort(column) was
# later replaced by DataFrame.sort_values.

# load pickle of all words and decades and drop words that appear in more than 15 of the 20 decades
df = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
origlen = len(df)
origwds = len(df.word.unique())
df = df[df.nonalpha == False]  # drop words containing non-alphabetic characters
wordcount = pd.DataFrame(df.groupby('word').decade.count())
wordcount = wordcount[wordcount.decade <= 15]
df = df[df.word.isin(wordcount.index)]
df = df[['word', 'decade', 'pct']]
print "{0} records reduced to {1} ({2:0.1f} %)".format(origlen, len(df), len(df) * 100.0 / origlen)
print "{0} words reduced to {1} ({2:0.1f} %)".format(origwds, len(df.word.unique()), len(df.word.unique()) * 100.0 / origwds)
print df.head(10)

# keep only words that appear in the crossword dictionary (this drops proper nouns, among others)
origlen = len(df)
origwds = len(df.word.unique())
xwords = json.loads(open('../data_user_pickle_csv/coha_and_xword.json', 'r').read())
df = df[df.word.isin(xwords)]
print "{0} records reduced to {1} ({2:0.1f} %)".format(origlen, len(df), len(df) * 100.0 / origlen)
print "{0} words reduced to {1} ({2:0.1f} %)".format(origwds, len(df.word.unique()), len(df.word.unique()) * 100.0 / origwds)

# keep the top 10000 words by summed pct plus the top 10000 by max pct
origlen = len(df)
origwds = len(df.word.unique())
dfsum = pd.DataFrame(df.groupby('word').pct.sum())
dfsum.sort('pct', ascending=False, inplace=True)
dfsum = dfsum[:10000]
dfmax = pd.DataFrame(df.groupby('word').pct.max())
dfmax.sort('pct', ascending=False, inplace=True)
dfmax = dfmax[:10000]
df = df[(df.word.isin(dfsum.index)) | (df.word.isin(dfmax.index))]
print "{0} records reduced to {1} ({2:0.1f} %)".format(origlen, len(df), len(df) * 100.0 / origlen)
print "{0} words reduced to {1} ({2:0.1f} %)".format(origwds, len(df.word.unique()), len(df.word.unique()) * 100.0 / origwds)

# add per-decade averages to dfsum; the fewer decades a word spans, the more decade-specific it is
series_count = df.groupby('word').decade.count()
dfsum['pct_per_decade'] = 0.0
dfsum['decades'] = 0
dfsum['decade_specificity'] = 0.0
for i in range(len(dfsum)):
    dfsum.pct_per_decade.iloc[i] = dfsum.pct[i] / series_count[dfsum.index[i]]
    dfsum.decades[i] = series_count[dfsum.index[i]]
    dfsum.decade_specificity[i] = 20 - series_count[dfsum.index[i]]
dfsum.sort('pct_per_decade', ascending=False, inplace=True)
print dfsum.head(50)

# for contrast, build the same ranking without the crossword-dictionary filter
# to see which proper nouns were omitted
df_proper = pd.read_pickle("../data_user_pickle_csv/coha_1.pickle")
df_proper = df_proper[df_proper.nonalpha == False]  # drop words containing non-alphabetic characters
wordcount = pd.DataFrame(df_proper.groupby('word').decade.count())
wordcount = wordcount[wordcount.decade <= 15]
df_proper = df_proper[df_proper.word.isin(wordcount.index)]
df_proper = df_proper[['word', 'decade', 'pct']]
df_propersum = pd.DataFrame(df_proper.groupby('word').pct.sum())
df_propersum.sort('pct', ascending=False, inplace=True)
df_propersum = df_propersum[:10000]
df_propermax = pd.DataFrame(df_proper.groupby('word').pct.max())
df_propermax.sort('pct', ascending=False, inplace=True)
df_propermax = df_propermax[:10000]
df_proper = df_proper[(df_proper.word.isin(df_propersum.index)) | (df_proper.word.isin(df_propermax.index))]
proper_series_count = df_proper.groupby('word').decade.count()
df_propersum['pct_per_decade'] = 0.0
for i in range(len(df_propersum)):
    df_propersum.pct_per_decade.iloc[i] = df_propersum.pct[i] / proper_series_count[df_propersum.index[i]]
df_propersum.sort('pct_per_decade', ascending=False, inplace=True)
df_propersum = df_propersum[~df_propersum.index.isin(dfsum.index)]
print df_propersum.head(50)
df_propersum[:50].to_csv('coha_top_omitted_proper_nouns.csv')
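# Aside: the per-row loops above (for dfsum and df_propersum) can also be
# written with vectorized pandas operations. A sketch only, reusing dfsum and
# series_count from above; 'counts' and 'dfsum_vectorized' are illustrative
# names, not part of the original script or its output files.
counts = series_count.reindex(dfsum.index)
dfsum_vectorized = pd.DataFrame({'pct': dfsum.pct,
                                 'pct_per_decade': dfsum.pct / counts,
                                 'decades': counts,
                                 'decade_specificity': 20.0 - counts})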
# make pivot table showing in which decades each top word occurred
decades = range(1810, 2010, 10)  # COHA decades: 1810 through 2000
dftop = dfsum[:50].copy()  # copy so the added decade columns do not touch dfsum
dftoplookup = df.copy()
for decade in decades:
    dftop[decade] = 0.0
for i in range(len(dftop)):
    for decade in decades:
        matches = dftoplookup[(dftoplookup.word == dftop.index[i]) & (dftoplookup.decade == decade)]
        if len(matches) > 0:
            dftop[decade].iloc[i] = matches.pct.iloc[0]
print dftop.head()
dftop.to_csv('coha_top_decades.csv')
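# Aside: the nested loop above is essentially a pivot, and pandas can build the
# same word-by-decade grid directly. A sketch only, assuming df and dftop as
# above; 'pivoted' is an illustrative name, not part of the original script.
# It reproduces only the decade columns of dftop, in the same row order.
pivoted = (df[df.word.isin(dftop.index)]
           .pivot(index='word', columns='decade', values='pct')
           .reindex(dftop.index)
           .fillna(0.0))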