#!/usr/bin/env python
# coding: utf-8
# In[81]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
import re
speech_links = pd.read_csv(r'C:\Users\FYD\Documents\GitHub\lda-brazilian-books\src\books\discursos-presidenciais\discursos-presidenciais-links.txt',
                           delimiter='|')
speech_links.head(2)
# In[57]:
start = time.time()
BASE_URL = 'http://www.itamaraty.gov.br'
full_link = []
speech = []
for link in speech_links['link']:
    full_link.append(BASE_URL + link)
    response = requests.get(BASE_URL + link)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Grab the <div itemprop="articleBody"> tag that holds the speech body
    speech_text = soup.find("div", itemprop="articleBody")
    speech.append(speech_text)
print("Process time: " + str(time.time() - start))
# In[164]:
df_speech_consolidated = pd.concat([speech_links,
                                    pd.DataFrame(full_link),
                                    pd.DataFrame(speech)], axis=1)
df_speech_consolidated.columns = ['president', 'link', 'full_link', 'text']
df_speech_consolidated.head(5)
# In[82]:
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]"|@,;]')
from nltk.corpus import stopwords
#BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('portuguese'))
def clean_text(text):
    """
    text: a string (the raw articleBody markup scraped above)
    return: lowercased string with markup remnants and stopwords removed
    """
    text = str(text)
    text = text.lower()  # lowercase text
    # Strip line-break characters (the original cell listed the literal
    # newlines one by one; they collapse to these two replaces)
    text = text.replace("\r", "")
    text = text.replace("\n", "")
    # Scrub HTML remnants left over from the scraped tag
    text = text.replace(" < p>", "")
    text = text.replace(".< p>", " ")
    text = text.replace(".", " ")
    text = text.replace("<", "")
    text = text.replace(">", "")
    text = text.replace("style=/", "")
    text = text.replace("br/", "")
    text = text.replace("div/", "")
    text = text.replace("div", "")
    text = text.replace("/p", "")
    text = text.replace("text-align", "")
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # replace the listed symbols with spaces
    #text = BAD_SYMBOLS_RE.sub('', text)  # optionally drop symbols matched by BAD_SYMBOLS_RE
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)  # remove stopwords
    return text
df_speech_consolidated['text_processed'] = df_speech_consolidated['text'].apply(clean_text)
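# In[ ]:
# Hedged alternative to the manual HTML scrubbing above: since each entry in
# `speech` is still a BeautifulSoup tag at this point, get_text() can strip
# the markup in one call before any string cleanup. A sketch, not the
# pipeline actually used for the saved dataset:
speech_plain = [tag.get_text(separator=' ', strip=True) if tag is not None else ''
                for tag in speech]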
# In[166]:
df_speech_consolidated.to_csv(r'C:\Users\FYD\Documents\GitHub\lda-brazilian-books\src\books\discursos-presidenciais\df_speech_consolidated.csv',
                              sep='|',
                              index_label=None)
# In[167]:
df_speech_consolidated['text_processed'].head(1)[0]
# In[3]:
df_speech_consolidated = pd.read_csv(r'C:\Users\FYD\Documents\GitHub\lda-brazilian-books\src\books\discursos-presidenciais\df_speech_consolidated.csv',
                                     delimiter='|',
                                     index_col=0)
df_speech_consolidated.head(2)
# In[4]:
# Basic counters
print(f'Qty rows: {df_speech_consolidated.shape[0]}, Qty columns: {df_speech_consolidated.shape[1]}')
# In[5]:
# Count speeches per president
df_speech_consolidated.groupby(['president']).size().reset_index()
# In[66]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pyLDAvis
import pyLDAvis.sklearn
import random
import seaborn as sns
import string
from collections import Counter
from PIL import Image
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Generate graphs inline in Jupyter
get_ipython().run_line_magic('matplotlib', 'inline')
# Lock random seeds used by libraries
random.seed(42)
np.random.seed(42)
# Load the Portuguese spaCy model used later to lemmatize text and
# drop personal pronouns, stopwords, and punctuation
import spacy
nlp = spacy.load(r"C:\Users\FYD\Anaconda3\Lib\site-packages\pt_core_news_sm\pt_core_news_sm-2.1.0")
punctuations = string.punctuation
# Define default stopwords list
stoplist = spacy.lang.pt.stop_words.STOP_WORDS
# In[73]:
# Extend the default stop list with additional Portuguese stopwords and
# scraping residue ('p', 'd', 'the', ...). Note: calling set.update() with a
# bare string iterates its characters, so stoplist.update('que') would add
# 'q', 'u', 'e' instead of the word; passing one list fixes that bug.
stoplist.update([
    'e', 'que', 'p', 'd', 'the', 'n', 'a', 'acerca', 'adeus', 'agora',
    'ainda', 'alem', 'algmas', 'algo', 'algumas', 'alguns', 'ali', 'além',
    'ambas', 'ambos', 'ano', 'anos', 'antes', 'ao', 'aonde', 'aos', 'apenas',
    'apoio', 'apontar', 'apos', 'após', 'aquela', 'aquelas', 'aquele',
    'aqueles', 'aqui', 'aquilo', 'as', 'assim', 'através', 'atrás', 'até',
    'aí', 'baixo', 'bastante', 'bem', 'boa', 'boas', 'bom', 'bons', 'breve',
    'cada', 'caminho', 'catorze', 'cedo', 'cento', 'certamente', 'certeza',
    'cima', 'cinco', 'coisa', 'com', 'como', 'comprido', 'conhecido',
    'conselho', 'contra', 'contudo', 'corrente', 'cuja', 'cujas', 'cujo',
    'cujos', 'custa', 'cá', 'da', 'daquela', 'daquelas', 'daquele',
    'daqueles', 'dar', 'das', 'de', 'debaixo', 'dela', 'delas', 'dele',
    'deles', 'demais', 'dentro', 'depois', 'desde', 'desligado', 'dessa',
    'dessas', 'desse', 'desses', 'desta', 'destas', 'deste', 'destes',
    'deve', 'devem', 'deverá', 'dez', 'dezanove', 'dezasseis', 'dezassete',
    'dezoito', 'dia', 'diante', 'direita', 'dispoe', 'dispoem', 'diversa',
    'diversas', 'diversos', 'diz', 'dizem', 'dizer', 'do', 'dois', 'dos',
    'doze', 'duas', 'durante', 'dá', 'dão', 'dúvida', 'ela', 'elas', 'ele',
    'eles', 'em', 'embora', 'enquanto', 'entao', 'entre', 'então', 'era',
    'eram', 'essa', 'essas', 'esse', 'esses', 'esta', 'estado', 'estamos',
    'estar', 'estará', 'estas', 'estava', 'estavam', 'este', 'esteja',
    'estejam', 'estejamos', 'estes', 'esteve', 'estive', 'estivemos',
    'estiver', 'estivera', 'estiveram', 'estiverem', 'estivermos',
    'estivesse', 'estivessem', 'estiveste', 'estivestes', 'estivéramos',
    'estivéssemos', 'estou', 'está', 'estás', 'estávamos', 'estão', 'eu',
    'exemplo', 'falta', 'fará', 'favor', 'faz', 'fazeis', 'fazem', 'fazemos',
    'fazer', 'fazes', 'fazia', 'faço', 'fez', 'fim', 'final', 'foi', 'fomos',
    'for', 'fora', 'foram', 'forem', 'forma', 'formos', 'fosse', 'fossem',
    'foste', 'fostes', 'fui', 'fôramos', 'fôssemos', 'geral', 'grande',
    'grandes', 'grupo', 'ha', 'haja', 'hajam', 'hajamos', 'havemos', 'havia',
    'hei', 'hoje', 'hora', 'horas', 'houve', 'houvemos', 'houver', 'houvera',
    'houveram', 'houverei', 'houverem', 'houveremos', 'houveria',
    'houveriam', 'houvermos', 'houverá', 'houverão', 'houveríamos',
    'houvesse', 'houvessem', 'houvéramos', 'houvéssemos', 'há', 'hão',
    'iniciar', 'inicio', 'ir', 'irá', 'isso', 'ista', 'iste', 'isto', 'já',
    'lado', 'lhe', 'lhes', 'ligado', 'local', 'logo', 'longe', 'lugar', 'lá',
    'maior', 'maioria', 'maiorias', 'mais', 'mal', 'mas', 'me', 'mediante',
    'meio', 'menor', 'menos', 'meses', 'mesma', 'mesmas', 'mesmo', 'mesmos',
    'meu', 'meus', 'mil', 'minha', 'minhas', 'momento', 'muito', 'muitos',
    'máximo', 'mês', 'na', 'nada', 'nao', 'naquela', 'naquelas', 'naquele',
    'naqueles', 'nas', 'nem', 'nenhuma', 'nessa', 'nessas', 'nesse',
    'nesses', 'nesta', 'nestas', 'neste', 'nestes', 'no', 'noite', 'nome',
    'nos', 'nossa', 'nossas', 'nosso', 'nossos', 'nova', 'novas', 'nove',
    'novo', 'novos', 'num', 'numa', 'numas', 'nunca', 'nuns', 'não',
    'nível', 'nós', 'número', 'o', 'obra', 'obrigada', 'obrigado', 'oitava',
    'oitavo', 'oito', 'onde', 'ontem', 'onze', 'os', 'ou', 'outra', 'outras',
    'outro', 'outros', 'para', 'parece', 'parte', 'partir', 'paucas',
    'pegar', 'pela', 'pelas', 'pelo', 'pelos', 'perante', 'perto', 'pessoas',
    'pode', 'podem', 'poder', 'poderá', 'podia', 'pois', 'ponto', 'pontos',
    'por', 'porque', 'porquê', 'portanto', 'posição', 'possivelmente',
    'posso', 'possível', 'pouca', 'pouco', 'poucos', 'povo', 'primeira',
    'primeiras', 'primeiro', 'primeiros', 'promeiro', 'propios', 'proprio',
    'própria', 'próprias', 'próprio', 'próprios', 'próxima', 'próximas',
    'próximo', 'próximos', 'puderam', 'pôde', 'põe', 'põem', 'quais', 'qual',
    'qualquer', 'quando', 'quanto', 'quarta', 'quarto', 'quatro', 'quem',
    'quer', 'quereis', 'querem', 'queremas', 'queres', 'quero', 'questão',
    'quieto', 'quinta', 'quinto', 'quinze', 'quáis', 'quê', 'relação',
    'sabe', 'sabem', 'saber', 'se', 'segunda', 'segundo', 'sei', 'seis',
    'seja', 'sejam', 'sejamos', 'sem', 'sempre', 'sendo', 'ser', 'serei',
    'seremos', 'seria', 'seriam', 'será', 'serão', 'seríamos', 'sete', 'seu',
    'seus', 'sexta', 'sexto', 'sim', 'sistema', 'sob', 'sobre', 'sois',
    'somente', 'somos', 'sou', 'sua', 'suas', 'são', 'sétima', 'sétimo',
    'só', 'tal', 'talvez', 'tambem', 'também', 'tanta', 'tantas', 'tanto',
    'tarde', 'te', 'tem', 'temos', 'tempo', 'tendes', 'tenha', 'tenham',
    'tenhamos', 'tenho', 'tens', 'tentar', 'tentaram', 'tente', 'tentei',
    'ter', 'terceira', 'terceiro', 'terei', 'teremos', 'teria', 'teriam',
    'terá', 'terão', 'teríamos', 'teu', 'teus', 'teve', 'tinha', 'tinham',
    'tipo', 'tive', 'tivemos', 'tiver', 'tivera', 'tiveram', 'tiverem',
    'tivermos', 'tivesse', 'tivessem', 'tiveste', 'tivestes', 'tivéramos',
    'tivéssemos', 'toda', 'todas', 'todo', 'todos', 'trabalhar', 'trabalho',
    'treze', 'três', 'tu', 'tua', 'tuas', 'tudo', 'tão', 'tém', 'têm',
    'tínhamos', 'um', 'uma', 'umas', 'uns', 'usa', 'usar', 'vai', 'vais',
    'valor', 'veja', 'vem', 'vens', 'ver', 'verdade', 'verdadeiro', 'vez',
    'vezes', 'viagem', 'vindo', 'vinte', 'você', 'vocês', 'vos', 'vossa',
    'vossas', 'vosso', 'vossos', 'vários', 'vão', 'vêm', 'vós', 'zero', 'à',
    'às', 'área', 'é', 'éramos', 'és', 'último',
])
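# In[ ]:
# Quick sanity check (an added assertion, not in the original notebook): the
# consolidated update above should register whole words, not single letters.
assert 'que' in stoplist and 'também' in stoplist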
# In[88]:
# Data exploration: most frequent words for a given president
def get_word_frequency(president):
    # Lemmatize and filter the documents before counting word frequency
    def cleanup_text(docs, logging=False):
        texts = []
        counter = 1
        for doc in docs:
            if counter % 1000 == 0 and logging:
                print("Processed %d out of %d documents." % (counter, len(docs)))
            counter += 1
            doc = nlp(doc, disable=['parser', 'ner'])
            tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
            tokens = [tok for tok in tokens if tok not in stoplist and tok not in punctuations]
            tokens = ' '.join(tokens)
            texts.append(tokens)
        return pd.Series(texts)
    df_text = [text for text in df_speech_consolidated[df_speech_consolidated['president'] == president]['text_processed']]
    df_text_clean = cleanup_text(df_text)
    df_text_clean = ' '.join(df_text_clean).split()
    df_text_clean_counts = Counter(df_text_clean)
    # Take the 31 most common tokens, then drop the very top one
    # (a residual artifact), leaving 30 for the plot
    df_common_words = [word[0] for word in df_text_clean_counts.most_common(31)]
    df_common_counts = [word[1] for word in df_text_clean_counts.most_common(31)]
    df_common_words.pop(0)
    df_common_counts.pop(0)
    fig = plt.figure(figsize=(18, 6))
    sns.barplot(x=df_common_words, y=df_common_counts)
    plt.title(f'Most Common Words used by {president}')
    plt.xticks(rotation=45)
    plt.show()
    fig.savefig(f'word_frequency_{president}.png', format='png', dpi=500)
# In[89]:
get_word_frequency('fhc')
# In[90]:
get_word_frequency('lula')
# In[91]:
get_word_frequency('dilma')
# In[92]:
get_word_frequency('temer')
# In[93]:
get_word_frequency('bolsonaro')
# In[99]:
# Word cloud with the most common words
def show_wordcloud(text, president):
    # Create and generate a word cloud image
    wordcloud = WordCloud(stopwords=stoplist, background_color="white").generate(text)
    # Display the generated image
    fig = plt.figure(figsize=(25, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for {president}', fontsize=20)
    plt.axis("off")
    plt.show()
    fig.savefig(f'word_cloud_{president}.png', format='png', dpi=500)
def get_wordcloud(df, president):
    dataframe = df[df['president'] == president]
    # Join all processed texts for this president and generate a cloud
    text = " ".join(speech for speech in dataframe.text_processed)
    show_wordcloud(text, president)
# In[100]:
get_wordcloud(df_speech_consolidated, 'fhc')
# In[101]:
get_wordcloud(df_speech_consolidated, 'lula')
# In[102]:
get_wordcloud(df_speech_consolidated, 'dilma')
# In[103]:
get_wordcloud(df_speech_consolidated, 'temer')
# In[104]:
get_wordcloud(df_speech_consolidated, 'bolsonaro')
# In[105]:
def get_lexical_diversity(df, president):
    dataframe = df[df['president'] == president]
    # Word stats: occurrence count per distinct word
    full_text_count = pd.DataFrame(Counter(" ".join(dataframe["text_processed"]).split()), index=[0])
    full_text_count = full_text_count.T
    full_text_count = full_text_count.reset_index()
    full_text_count.columns = ['word', 'qty']
    # Distinct words for the numerator
    distinct_words = set()
    dataframe['text_processed'].str.lower().str.split().apply(distinct_words.update)
    total_distinct_words = len(distinct_words)
    # All word occurrences for the denominator
    total_words = full_text_count['qty'].sum()
    lexical_diversity = round(total_distinct_words / total_words, 2)
    print(f'Lexical Diversity for {president}: {lexical_diversity}')
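# In[ ]:
# The ratio above is the classic type-token ratio: distinct words over total
# word occurrences. A toy check, added here purely for illustration:
toy = 'brasil brasil política externa política'.split()
print(round(len(set(toy)) / len(toy), 2))  # 3 distinct / 5 total -> 0.6
# Caveat: type-token ratio tends to fall as a corpus grows, so comparisons
# between presidents with very different speech volumes should be read with care.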
# In[106]:
get_lexical_diversity(df_speech_consolidated, 'fhc')
# In[107]:
get_lexical_diversity(df_speech_consolidated, 'lula')
# In[108]:
get_lexical_diversity(df_speech_consolidated, 'dilma')
# In[109]:
get_lexical_diversity(df_speech_consolidated, 'temer')
# In[110]:
get_lexical_diversity(df_speech_consolidated, 'bolsonaro')
# In[112]:
def get_word_ngrams_list(df, president, word_ngram):
    def get_top_word_n_bigram(corpus, n=None):
        # Count n-grams of the requested size across the corpus
        vec = CountVectorizer(ngram_range=(word_ngram, word_ngram)).fit(corpus)
        bag_of_words = vec.transform(corpus)
        sum_words = bag_of_words.sum(axis=0)
        words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
        words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
        return words_freq[:n]
    common_words = get_top_word_n_bigram(df[df['president'] == president]['text_processed'], 20)
    df3 = pd.DataFrame(common_words, columns=['ngram', 'qty'])
    return df3
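# In[ ]:
# Small illustration (not part of the analysis) of what ngram_range=(n, n)
# produces: with (2, 2), CountVectorizer counts only bigrams.
demo_vec = CountVectorizer(ngram_range=(2, 2)).fit(['política externa brasileira'])
print(demo_vec.get_feature_names())  # ['externa brasileira', 'política externa']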
# In[113]:
get_word_ngrams_list(df_speech_consolidated, 'fhc', 2)
# In[114]:
get_word_ngrams_list(df_speech_consolidated, 'lula', 2)
# In[115]:
get_word_ngrams_list(df_speech_consolidated, 'dilma', 2)
# In[116]:
get_word_ngrams_list(df_speech_consolidated, 'temer', 2)
# In[117]:
get_word_ngrams_list(df_speech_consolidated, 'bolsonaro', 2)
# In[118]:
get_word_ngrams_list(df_speech_consolidated, 'fhc', 3)
# In[119]:
get_word_ngrams_list(df_speech_consolidated, 'lula', 3)
# In[120]:
get_word_ngrams_list(df_speech_consolidated, 'dilma', 3)
# In[121]:
get_word_ngrams_list(df_speech_consolidated, 'temer', 3)
# In[122]:
get_word_ngrams_list(df_speech_consolidated, 'bolsonaro', 3)
# In[125]:
# LDA Analysis dataframes
df_speech_consolidated_fhc = df_speech_consolidated[df_speech_consolidated['president'] == 'fhc']
df_speech_consolidated_lula = df_speech_consolidated[df_speech_consolidated['president'] == 'lula']
df_speech_consolidated_dilma = df_speech_consolidated[df_speech_consolidated['president'] == 'dilma']
df_speech_consolidated_temer = df_speech_consolidated[df_speech_consolidated['president'] == 'temer']
df_speech_consolidated_bolsonaro = df_speech_consolidated[df_speech_consolidated['president'] == 'bolsonaro']
# In[135]:
def get_topics(df, n_components, number_words):
    # Convert to list
    data = df.text_processed.values.tolist()
    # Remove e-mail-like tokens
    data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]
    # Collapse whitespace runs
    data = [re.sub(r'\s+', ' ', sent) for sent in data]
    # Remove distracting single quotes
    data = [re.sub(r"\'", "", sent) for sent in data]
    vectorizer = CountVectorizer(analyzer='word',
                                 #min_df=10,
                                 stop_words=stoplist,
                                 lowercase=True,
                                 token_pattern='[a-zA-Z0-9]{3,}',  # ASCII tokens of 3+ chars (accented letters split tokens)
                                 )
    data_vectorized = vectorizer.fit_transform(data)
    # Materialize the sparse matrix
    data_dense = data_vectorized.todense()
    # Compute sparsity = percentage of non-zero cells
    print("Sparsity: ", ((data_dense > 0).sum() / data_dense.size) * 100, "%")
    # Build the LDA model
    lda_model = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=10,
                                          learning_method='online',
                                          random_state=42,
                                          batch_size=10,
                                          evaluate_every=-1,
                                          n_jobs=1,
                                          )
    lda_output = lda_model.fit_transform(data_vectorized)
    # Helper function to print the top words of each topic
    def print_topics(model, count_vectorizer, n_top_words):
        words = count_vectorizer.get_feature_names()
        for topic_idx, topic in enumerate(model.components_):
            print("\nTopic #%d:" % topic_idx)
            print(" ".join([words[i]
                            for i in topic.argsort()[:-n_top_words - 1:-1]]))
    # Print the topics found by the LDA model (already fitted by fit_transform above)
    print("Topics found via LDA:")
    print_topics(lda_model, vectorizer, number_words)
    return lda_model, data_vectorized, data, lda_output, vectorizer
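# In[ ]:
# Hedged sketch (not part of the original analysis): n_components=10 is used
# below as a fixed choice. One common sanity check is to compare perplexity
# across candidate topic counts; lower is better, though perplexity and
# human-judged topic quality do not always agree.
def compare_topic_counts(data_vectorized, candidates=(5, 10, 15)):
    for k in candidates:
        lda = LatentDirichletAllocation(n_components=k, max_iter=10,
                                        learning_method='online',
                                        random_state=42, n_jobs=1)
        lda.fit(data_vectorized)
        print(f'n_components={k}: perplexity={lda.perplexity(data_vectorized):.1f}')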
# In[140]:
lda_model_fhc, data_vectorized_fhc, data_fhc, lda_output_fhc, vectorizer_fhc = \
get_topics(df_speech_consolidated_fhc, n_components=10, number_words=10)
# In[141]:
lda_model_lula, data_vectorized_lula, data_lula, lda_output_lula, vectorizer_lula = \
get_topics(df_speech_consolidated_lula, n_components=10, number_words=10)
# In[142]:
lda_model_dilma, data_vectorized_dilma, data_dilma, lda_output_dilma, vectorizer_dilma = \
get_topics(df_speech_consolidated_dilma, n_components=10, number_words=10)
# In[143]:
lda_model_temer, data_vectorized_temer, data_temer, lda_output_temer, vectorizer_temer = \
get_topics(df_speech_consolidated_temer, n_components=10, number_words=10)
# In[144]:
lda_model_bolsonaro, data_vectorized_bolsonaro, data_bolsonaro, lda_output_bolsonaro, vectorizer_bolsonaro = \
get_topics(df_speech_consolidated_bolsonaro, n_components=10, number_words=10)
# In[145]:
def get_topic_per_document(lda_model, data_vectorized, data, lda_output):
    # Create the document-topic matrix (recomputed from the fitted model;
    # the lda_output argument is kept for call-site compatibility)
    lda_output = lda_model.transform(data_vectorized)
    # Column names
    topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]
    # Index names
    docnames = ["Doc" + str(i) for i in range(len(data))]
    # Build the pandas dataframe
    df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
    # Get the dominant topic for each document
    dominant_topic = np.argmax(df_document_topic.values, axis=1)
    df_document_topic['dominant_topic'] = dominant_topic
    # Styling helpers: highlight topic weights above 0.1
    def color_green(val):
        color = 'green' if val > .1 else 'black'
        return 'color: {col}'.format(col=color)
    def make_bold(val):
        weight = 700 if val > .1 else 400
        return 'font-weight: {weight}'.format(weight=weight)
    # Apply the styles to the first 15 documents
    df_document_topics_styled = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
    return df_document_topics_styled, df_document_topic
# In[146]:
df_topic_per_document_lula_styled, df_topic_per_document_lula = \
get_topic_per_document(lda_model_lula,
data_vectorized_lula,
data_lula,
lda_output_lula)
df_topic_per_document_lula_styled
# In[147]:
df_topic_per_document_dilma_styled, df_topic_per_document_dilma = \
get_topic_per_document(lda_model_dilma,
data_vectorized_dilma,
data_dilma,
lda_output_dilma)
df_topic_per_document_dilma_styled
# In[148]:
df_topic_per_document_temer_styled, df_topic_per_document_temer = \
get_topic_per_document(lda_model_temer,
data_vectorized_temer,
data_temer,
lda_output_temer)
df_topic_per_document_temer_styled
# In[149]:
def get_topic_distribution(df_document_topic):
    df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
    df_topic_distribution.columns = ['Topic Num', 'Num Documents']
    total_docs = df_topic_distribution['Num Documents'].sum()
    df_topic_distribution['perc_per_topic'] = np.round((df_topic_distribution['Num Documents'] / total_docs) * 100, 2)
    return df_topic_distribution
# In[150]:
get_topic_distribution(df_topic_per_document_lula)
# In[151]:
get_topic_distribution(df_topic_per_document_dilma)
# In[152]:
get_topic_distribution(df_topic_per_document_temer)
# In[153]:
def get_lda_plot(lda_model, data_vectorized, vectorizer):
    pyLDAvis.enable_notebook()
    # t-SNE projection of the topic space for the interactive panel
    panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
    return panel
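# In[ ]:
# Hedged aside: a pyLDAvis panel can also be written to standalone HTML for
# sharing outside the notebook. The filename is illustrative, not from the
# original analysis.
panel_lula = get_lda_plot(lda_model_lula, data_vectorized_lula, vectorizer_lula)
pyLDAvis.save_html(panel_lula, 'lda_lula.html')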
# In[154]:
get_lda_plot(lda_model_lula, data_vectorized_lula, vectorizer_lula)
# In[155]:
get_lda_plot(lda_model_dilma, data_vectorized_dilma, vectorizer_dilma)
# In[156]:
get_lda_plot(lda_model_temer, data_vectorized_temer, vectorizer_temer)