The following code reads several books stored as plain-text (.txt) files, organized in folders by language and author, and computes the frequency of the words in each book.
import os
import time
from collections import Counter

import pandas as pd
from tqdm import tqdm_notebook as tqdm
# specify the folder's directory where the book files are located
book_dir = './Books'

# column layouts of the two result tables
COUNT_COLUMNS = ['lang', 'author', 'book_title', 'words', 'freq']
STAT_COLUMNS = ['lang', 'author', 'book_title', 'process_time', 'uniq_words', 'total_words']

# The most common symbols, marks, and numbers; each one is deleted from a book before counting.
# Kept as a single string (not a list of literals) so a missing comma cannot silently
# merge two entries into one multi-character token that never matches.
SKIP_CHARS = ',.:;¿?¡!#"\'-(){}1234567890'
SKIP_TABLE = str.maketrans('', '', SKIP_CHARS)


def clean_text(text):
    """Return ``text`` prepared for word counting.

    Line breaks (``\\n`` and ``\\r``) become spaces, letters are lower-cased, and
    every character in ``SKIP_CHARS`` is deleted (not replaced by a space), so
    ``"don't"`` becomes ``"dont"``.
    """
    text = text.replace('\n', ' ').replace('\r', ' ')
    return text.lower().translate(SKIP_TABLE)


def count_words(text):
    """Count how often each word occurs in already-cleaned ``text``.

    Words are the non-empty pieces obtained by splitting on single spaces.

    Returns:
        A ``collections.Counter`` mapping each word to its number of occurrences.
    """
    return Counter(filter(None, text.split(' ')))


def frequency_table(language, author, book_title, counts):
    """Build the per-book frequency table, most frequent word first.

    Args:
        language, author, book_title: labels repeated on every row.
        counts: ``Counter`` of word -> occurrences for the book.

    Returns:
        A DataFrame with the columns listed in ``COUNT_COLUMNS``; empty (but with
        those columns) when ``counts`` is empty.
    """
    rows = [(language, author, book_title, word, freq)
            for word, freq in counts.most_common()]
    return pd.DataFrame(rows, columns=COUNT_COLUMNS)


def process_books(root, progress=None):
    """Read every book under ``root`` and collect word statistics.

    ``root`` is walked as ``<language>/<author>/<title>``; every title is opened
    as UTF-8 text.

    Args:
        root: directory holding one sub-folder per language.
        progress: optional callable wrapping the language iterable (for example
            a tqdm progress bar); ``None`` iterates without progress output.

    Returns:
        ``(stat_result, count_result)``: one row per book with processing time,
        unique-word and total-word counts, and one row per (book, word) with its
        frequency.

    Raises:
        OSError: if a directory or file cannot be listed or opened.
        UnicodeDecodeError: if a book is not valid UTF-8.
    """
    languages = os.listdir(root)
    if progress is not None:
        languages = progress(languages)

    stat_rows = []
    count_frames = []
    # iterate and read every file by language, author, and title
    for language in languages:
        language_dir = os.path.join(root, language)
        for author in os.listdir(language_dir):
            author_dir = os.path.join(language_dir, author)
            for title in os.listdir(author_dir):
                # time each book on its own so rounding errors do not accumulate
                started = time.perf_counter()
                with open(os.path.join(author_dir, title), 'r', encoding='utf8') as current_file:
                    text = current_file.read()
                # strip only a trailing extension, never a '.txt' inside the name
                book_title = title[:-len('.txt')] if title.endswith('.txt') else title
                counts = count_words(clean_text(text))
                count_frames.append(frequency_table(language, author, book_title, counts))
                process_time = round(time.perf_counter() - started, 2)
                stat_rows.append((language, author, book_title, process_time,
                                  len(counts), sum(counts.values())))

    # assemble both tables once instead of growing DataFrames inside the loop
    stat_result = pd.DataFrame(stat_rows, columns=STAT_COLUMNS)
    if count_frames:
        count_result = pd.concat(count_frames, axis=0, ignore_index=True)
    else:
        count_result = pd.DataFrame(columns=COUNT_COLUMNS)
    return stat_result, count_result


# The guard is true when this runs as a notebook cell or a script, so the results
# still become globals; it only keeps the helpers importable without starting the
# corpus scan.
if __name__ == '__main__':
    stat_result, count_result = process_books(book_dir, progress=tqdm)
    # output the results as a .csv file
    stat_result.to_csv('stat_summary.csv', sep=',')
    count_result.to_csv('word_frequencies.csv', sep=',')
    print('\n-----------------------------')
    print(stat_result)
    print('\n-----------------------------')
    print(count_result)
HBox(children=(IntProgress(value=0, max=4), HTML(value='')))
----------------------------- lang author book_title \ 0 english shakespeare A Midsummer Night's Dream 1 english shakespeare Hamlet 2 english shakespeare Macbeth 3 english shakespeare Othello 4 english shakespeare Richard III 5 english shakespeare Romeo and Juliet 6 english shakespeare The Merchant of Venice 7 french chevalier L'ale de sable 8 french chevalier L'enfer et le paradis de l'autre monde 9 french chevalier La capitaine 10 french chevalier La fille des indiens rouges 11 french chevalier La fille du pirate 12 french chevalier Le chasseur noir 13 french chevalier Les derniers Iroquois 14 french de Maupassant Boule de Suif 15 french de Maupassant Claire de Lune 16 french de Maupassant Contes de la Becasse 17 french de Maupassant Euvres completes de Guy de Maupassant 18 french de Maupassant L'inutile beaut 19 french de Maupassant La Main Gauche 20 french de Maupassant La Maison Tellier 21 french de Maupassant La petite roque 22 french de Maupassant Le Horla 23 french diderot Ceci n'est pas un conte 24 french diderot Entretien d'un pare avec ses enfants 25 french diderot L'oiseau blanc 26 french diderot Les deux amis de Bourbonne 27 french diderot Regrets sur ma vieille robe de chambre 28 french sand cora 29 french sand Jacques le fataliste et son maatre .. ... ... ... 
72 german shakespeare Romeo und Julia 73 portuguese branco A Filha do Arcediago 74 portuguese branco A Neta do Arcediago 75 portuguese branco A Queda d'um Anjo 76 portuguese branco Agulha em Palheiro 77 portuguese branco Amor de Perdicao 78 portuguese branco Amor de Salvacao 79 portuguese branco Annos de Prosa 80 portuguese branco Carlota Angela 81 portuguese branco Estrellas Funestas 82 portuguese branco Estrellas Propicias 83 portuguese branco Lagrimas Abenaoadas 84 portuguese branco Livro de Consolacao 85 portuguese branco O Olho de Vidro 86 portuguese branco O que fazem mulheres 87 portuguese branco O Regicida 88 portuguese branco Scenas Contemporaneas 89 portuguese dinis A Morgadinha dos Cannaviaes 90 portuguese dinis Os fidalgos da Casa Mourisca 91 portuguese dinis Uma familia ingleza 92 portuguese Queiros A Cidade e as Serras 93 portuguese Queiros A correspondancia de Fradique Mendes 94 portuguese Queiros A Illustre Casa de Ramires 95 portuguese Queiros A Reliquia 96 portuguese Queiros Cartas de Inglaterra 97 portuguese Queiros O crime do padre Amaro 98 portuguese Queiros O Mandarim 99 portuguese Queiros O Primo Bazilio 100 portuguese Queiros Os Maias 101 portuguese shakespeare Hamlet process_time uniq_words total_words 0 81.30 3226 16972 1 145.04 4794 29575 2 86.92 3552 17646 3 131.33 4032 27379 4 164.85 4705 34665 5 136.36 4325 28920 6 99.99 3529 21951 7 498.27 13420 77237 8 238.54 7549 44085 9 281.60 9421 49153 10 468.99 12677 72098 11 325.74 11148 52969 12 317.44 8860 55197 13 368.96 11370 59296 14 216.56 8322 39201 15 142.55 6434 28054 16 200.75 7948 37309 17 353.40 11747 57768 18 222.65 7967 41145 19 197.98 7431 37089 20 274.03 9308 47994 21 241.70 8027 44309 22 244.25 8129 44560 23 50.01 3111 11379 24 51.34 3027 11679 25 111.02 4992 23468 26 37.41 2545 8804 27 21.43 1636 5264 28 59.10 3742 13224 29 587.25 12076 95090 .. ... ... ... 
72 111.66 5365 23293 73 488.96 12740 78149 74 342.99 10690 58000 75 333.35 12274 52895 76 275.29 10048 46938 77 296.79 9580 51352 78 322.33 11600 52100 79 436.01 12960 67263 80 299.18 10465 50429 81 305.61 10431 51228 82 271.82 9644 47073 83 284.45 9699 49025 84 369.52 12429 58501 85 283.07 10423 48083 86 254.62 9819 43882 87 335.32 11662 55239 88 320.58 11097 53202 89 1200.59 19715 148738 90 1126.08 17492 144454 91 969.08 17612 121923 92 508.40 14453 71227 93 388.24 13465 56881 94 834.74 17577 107379 95 616.67 15747 84947 96 300.57 11205 48481 97 1129.89 18832 141629 98 116.44 6831 22486 99 900.40 17980 118417 100 1989.46 24453 215271 101 183.22 7206 34327 [102 rows x 6 columns] ----------------------------- lang author book_title words freq 0 english shakespeare A Midsummer Night's Dream the 579 1 english shakespeare A Midsummer Night's Dream and 562 2 english shakespeare A Midsummer Night's Dream i 443 3 english shakespeare A Midsummer Night's Dream to 337 4 english shakespeare A Midsummer Night's Dream you 273 5 english shakespeare A Midsummer Night's Dream of 269 6 english shakespeare A Midsummer Night's Dream a 264 7 english shakespeare A Midsummer Night's Dream in 239 8 english shakespeare A Midsummer Night's Dream my 204 9 english shakespeare A Midsummer Night's Dream is 190 10 english shakespeare A Midsummer Night's Dream that 184 11 english shakespeare A Midsummer Night's Dream with 175 12 english shakespeare A Midsummer Night's Dream me 174 13 english shakespeare A Midsummer Night's Dream not 171 14 english shakespeare A Midsummer Night's Dream this 162 15 english shakespeare A Midsummer Night's Dream her 148 16 english shakespeare A Midsummer Night's Dream for 143 17 english shakespeare A Midsummer Night's Dream it 132 18 english shakespeare A Midsummer Night's Dream your 128 19 english shakespeare A Midsummer Night's Dream but 121 20 english shakespeare A Midsummer Night's Dream thou 118 21 english shakespeare A Midsummer Night's Dream as 115 22 
english shakespeare A Midsummer Night's Dream so 113 23 english shakespeare A Midsummer Night's Dream will 111 24 english shakespeare A Midsummer Night's Dream loue 105 25 english shakespeare A Midsummer Night's Dream be 104 26 english shakespeare A Midsummer Night's Dream haue 95 27 english shakespeare A Midsummer Night's Dream his 93 28 english shakespeare A Midsummer Night's Dream all 91 29 english shakespeare A Midsummer Night's Dream no 85 ... ... ... ... ... ... 825219 portuguese shakespeare Hamlet lançarmas 1 825220 portuguese shakespeare Hamlet arrancarme 1 825221 portuguese shakespeare Hamlet representem 1 825222 portuguese shakespeare Hamlet esbofetearme 1 825223 portuguese shakespeare Hamlet attentado 1 825224 portuguese shakespeare Hamlet inacção 1 825225 portuguese shakespeare Hamlet fico 1 825226 portuguese shakespeare Hamlet confusa 1 825227 portuguese shakespeare Hamlet tibia 1 825228 portuguese shakespeare Hamlet ficavam 1 825229 portuguese shakespeare Hamlet vilipendios 1 825230 portuguese shakespeare Hamlet possiveis 1 825231 portuguese shakespeare Hamlet inoffensivo 1 825232 portuguese shakespeare Hamlet fel 1 825233 portuguese shakespeare Hamlet trahese 1 825234 portuguese shakespeare Hamlet espontanea 1 825235 portuguese shakespeare Hamlet perturbaram 1 825236 portuguese shakespeare Hamlet dramaticas 1 825237 portuguese shakespeare Hamlet assistindo 1 825238 portuguese shakespeare Hamlet eila 1 825239 portuguese shakespeare Hamlet pausa 1 825240 portuguese shakespeare Hamlet procuremos 1 825241 portuguese shakespeare Hamlet imprecações 1 825242 portuguese shakespeare Hamlet vãs 1 825243 portuguese shakespeare Hamlet gastar 1 825244 portuguese shakespeare Hamlet instigam 1 825245 portuguese shakespeare Hamlet adulterio 1 825246 portuguese shakespeare Hamlet impudico 1 825247 portuguese shakespeare Hamlet abutres 1 825248 portuguese shakespeare Hamlet hear 1 [825249 rows x 5 columns]
%matplotlib inline
import matplotlib.pyplot as plt
# Draw one log-log scatter series per language: unique words against total words per book.
plt.figure(figsize=(12, 12))
for language in os.listdir(book_dir):
    # keep only the rows of stat_result recorded for this language
    per_language = stat_result.loc[stat_result['lang'] == language]
    plt.loglog(per_language['total_words'], per_language['uniq_words'], "o", label=language)
plt.xlabel("Total Number of Words")
plt.ylabel("Number of unique words")
plt.legend()
# write the figure to disk before displaying it
plt.savefig("total_vs_unique_words.png")
plt.show()