#!/usr/bin/env python
# coding: utf-8

# # FILE READER
# The following code reads several books stored as UTF-8 text (.txt) files
# and computes the frequency of their words, grouped by language.

# In[1]:


import os
import time
from collections import Counter

import pandas as pd
from tqdm import tqdm_notebook as tqdm

# Folder holding the books, walked as <book_dir>/<language>/<author>/<file>.
book_dir = './Books'

# Column layouts of the two result tables written to disk below.
COUNT_COLUMNS = ['lang', 'author', 'book_title', 'words', 'freq']
STAT_COLUMNS = ['lang', 'author', 'book_title', 'process_time',
                'uniq_words', 'total_words']

# Punctuation marks and digits deleted from the text before counting.
# Kept as ONE string of single characters on purpose: the previous list
# literal was missing a comma between '#' and '"', which silently merged
# them into the two-character entry '#"' so neither was removed on its own.
_SKIP_CHARS = ',.:;¿?¡!#"\'-(){}1234567890'
_SKIP_TABLE = str.maketrans('', '', _SKIP_CHARS)

_TXT_SUFFIX = '.txt'


def _subdirs(path):
    """Return the sorted names of the directories directly inside *path*.

    Non-directory entries are skipped so that a stray file does not make the
    nested ``os.listdir`` calls fail; sorting makes the output order stable.
    """
    return sorted(
        name for name in os.listdir(path)
        if os.path.isdir(os.path.join(path, name))
    )


def _files(path):
    """Return the sorted names of the regular files directly inside *path*."""
    return sorted(
        name for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name))
    )


def _book_title(filename):
    """Return *filename* without a trailing '.txt' (other names unchanged)."""
    if filename.endswith(_TXT_SUFFIX):
        return filename[:-len(_TXT_SUFFIX)]
    return filename


def _count_words(text):
    """Return a ``Counter`` mapping each cleaned word of *text* to its count.

    The text is lower-cased, every character in ``_SKIP_CHARS`` is deleted,
    and the result is split on any run of whitespace (spaces, newlines,
    tabs, ...), so empty tokens never appear.
    """
    cleaned = text.lower().translate(_SKIP_TABLE)
    return Counter(cleaned.split())


# Rows are accumulated in plain lists and turned into DataFrames once at the
# end; growing a DataFrame row by row (or concatenating per book) re-copies
# the whole table on every step.
count_rows = []
stat_rows = []

time_start = time.time()
time_accum = 0

# iterate and read every file by language, author, and title
for language in tqdm(_subdirs(book_dir)):
    language_dir = os.path.join(book_dir, language)
    for author in _subdirs(language_dir):
        author_dir = os.path.join(language_dir, author)
        for title in _files(author_dir):
            title_path = os.path.join(author_dir, title)

            with open(title_path, 'r', encoding='utf8') as current_file:
                word_counts = _count_words(current_file.read())

            book_title = _book_title(title)

            # most_common() yields (word, count) pairs in descending count
            # order, matching the per-book descending sort of the table.
            count_rows.extend(
                (language, author, book_title, word, freq)
                for word, freq in word_counts.most_common()
            )

            # Time attributed to this book = total elapsed time minus the
            # time already attributed to the previous books.
            process_time = round(time.time() - time_accum - time_start, 2)
            time_accum += process_time

            stat_rows.append((
                language,
                author,
                book_title,
                process_time,
                len(word_counts),            # number of unique words
                sum(word_counts.values()),   # total number of words
            ))

count_result = pd.DataFrame(count_rows, columns=COUNT_COLUMNS)
stat_result = pd.DataFrame(stat_rows, columns=STAT_COLUMNS)

# output the results as .csv files
stat_result.to_csv('stat_summary.csv', sep=',')
count_result.to_csv('word_frequencies.csv', sep=',')

print('\n-----------------------------')
print(stat_result)
print('\n-----------------------------')
print(count_result)


# #### VISUALIZE THE RESULTS

# In[2]:


get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 12))

# plot the stats of every book, one series per language
for language in _subdirs(book_dir):
    subset = stat_result[stat_result.lang == language]  # rows of this language
    plt.loglog(subset.total_words, subset.uniq_words, "o", label=language)

plt.legend()
plt.xlabel("Total Number of Words")
plt.ylabel("Number of unique words")
plt.savefig("total_vs_unique_words.png")
plt.show()


# In[ ]: