# In[24]:
# output_dict = {'speaker': [],
#               'date': [],
#               'start_size': [],
#               'end_size': [],
#               'change_size': [], 
#               'change_size_perc': []}

import os

# Root of the speech corpus. The path layout used here implies one
# sub-directory per president, with file names of the form
# "<president>_<rest>".
text_dir = './Corpus_of_Presential_Speeches/'

# Run the local ``./infgen`` executable on every file under the corpus root
# and store whatever it writes to stdout next to the input, as
# ``<input>.infgen``.
for dirpath, _, filenames in walk(text_dir):
    for file in filenames:
        # Skip dumps produced by an earlier run of this cell; otherwise a
        # re-run would feed them back into infgen and pile up
        # ``*.infgen.infgen`` files.
        if file.endswith('.infgen'):
            continue

        # Speaker name is the part of the file name before the first
        # underscore. Not needed for the path any more, but kept because the
        # disabled analysis code following this loop still refers to it.
        president = file.split("_")[0]

        # Use the directory walk() actually found the file in, instead of
        # rebuilding it from the file-name prefix (which breaks for any file
        # whose folder does not match that prefix).
        entire_filename = os.path.join(dirpath, file)

        # NOTE(review): presumably the inputs were compressed by this one-off
        # step before infgen was run on them -- confirm.
        #check_call(['gzip', entire_filename])
        with open(entire_filename + '.infgen', 'w') as outfile:
            # check_call raises CalledProcessError if infgen exits non-zero,
            # which aborts the whole walk.
            check_call(["./infgen", entire_filename], stdout=outfile)
        
#         with open(entire_filename) as f:
#             file_contents = [line.split() for line in f]
            
#         text = [item for sublist in file_contents[2:] for item in sublist]
#         text = ' '.join(text)
        
#         output_dict['speaker'].append(president)
#         output_dict['date'].append(pd.to_datetime(' '.join(file_contents[1]).split('"')[1]).strftime('%m-%d-%Y'))
#         output_dict['start_size'].append(getsizeof(text))
        
#         compressed_text = compress(text)
#         output_dict['end_size'].append(getsizeof(compressed_text))
#         output_dict['change_size'].append(output_dict['start_size'][-1] - output_dict['end_size'][-1])
#         output_dict['change_size_perc'].append(output_dict['end_size'][-1]*1.0 / output_dict['start_size'][-1])