# output_dict = {'speaker': [],
# 'date': [],
# 'start_size': [],
# 'end_size': [],
# 'change_size': [],
# 'change_size_perc': []}
# Run the external ./infgen program on every file under the corpus directory
# and capture its stdout in a sibling file named "<input>.infgen".
# NOTE(review): presumably ./infgen is a deflate-stream disassembler that
# expects compressed input -- confirm against the gzip step disabled below.
import os.path  # local import: only this section needs path joining

text_dir = './Corpus_of_Presential_Speeches/'
for dirpath, _, filenames in walk(text_dir):
    for file in filenames:
        # Skip outputs written by an earlier run; otherwise a re-run would
        # feed them back to ./infgen and try to create "*.infgen.infgen".
        if file.endswith('.infgen'):
            continue
        # Speaker prefix of the filename (text before the first underscore).
        # Not needed for the path any more, but the disabled analysis code
        # further down still refers to it, so the name stays defined.
        president = file.split("_")[0]
        # Use the directory walk() actually found the file in, rather than
        # guessing it from the filename prefix.
        entire_filename = os.path.join(dirpath, file)
        # check_call(['gzip', entire_filename])
        with open(entire_filename + '.infgen', 'w') as outfile:
            # check_call raises CalledProcessError on a non-zero exit status,
            # so a failing file stops the run instead of passing silently.
            check_call(["./infgen", entire_filename], stdout=outfile)
# with open(entire_filename) as f:
# file_contents = [line.split() for line in f]
# text = [item for sublist in file_contents[2:] for item in sublist]
# text = ' '.join(text)
# output_dict['speaker'].append(president)
# output_dict['date'].append(pd.to_datetime(' '.join(file_contents[1]).split('"')[1]).strftime('%m-%d-%Y'))
# output_dict['start_size'].append(getsizeof(text))
# compressed_text = compress(text)
# output_dict['end_size'].append(getsizeof(compressed_text))
# output_dict['change_size'].append(output_dict['start_size'][-1] - output_dict['end_size'][-1])
# output_dict['change_size_perc'].append(output_dict['end_size'][-1]*1.0 / output_dict['start_size'][-1])