Datos: Libre elección
Expectativas:
Obtener, con most_similar, las palabras más parecidas a tres palabras que le llamen la atención.
Bonus:
# Fake-news analysis
# Preprocessing
import re
import pandas as pd
from nltk.corpus import stopwords
# Spanish stopword list, loaded once at module level and reused by pre_procesado.
stopwords_sp = stopwords.words('spanish')
def pre_procesado(texto):
    """Normalize raw article text into a list of content tokens.

    Lowercases the text, collapses every run of non-letter characters
    (punctuation, digits, underscores) into a space, then drops tokens
    that are short (<= 2 chars) or Spanish stopwords.

    Parameters
    ----------
    texto : str
        Raw document text.

    Returns
    -------
    list[str]
        Cleaned tokens, in their original order.
    """
    texto = texto.lower()
    texto = re.sub(r"[\W\d_]+", " ", texto)
    # Single pass: the original did two join/split round-trips (one per
    # filter); applying both independent predicates at once is equivalent
    # and avoids rebuilding the string twice.
    return [
        palabra
        for palabra in texto.split()
        if len(palabra) > 2 and palabra not in stopwords_sp
    ]
# Load the fake-news corpus and tokenize every article into a new 'pp' column.
# NOTE(review): the relative path assumes the working directory is the user's
# home folder — confirm before running elsewhere.
fake = pd.read_csv("desktop/archivos/onlyfakes1000.csv")
# pre_procesado is already a one-argument callable; the lambda wrapper was redundant.
fake['pp'] = fake['text'].apply(pre_procesado)
fake.head()
text | pp | |
---|---|---|
0 | El suceso ha tenido lugar en Brasil. Un adole... | [suceso, lugar, brasil, adolescente, años, mur... |
1 | Estamos en la semana decisiva. Los expertos a... | [semana, decisiva, expertos, aseguran, campaña... |
2 | Estudios científicos hay muchos. Unos nos int... | [estudios, científicos, interesan, concreto, h... |
3 | Ha sucedido en la ciudad de San José de Río P... | [sucedido, ciudad, san, josé, río, preto, bras... |
4 | La fiesta en Sevilla por el vuelco electoral ... | [fiesta, sevilla, vuelco, electoral, alargó, c... |
import gensim.models.word2vec as w2v
%%time
# Train a skip-gram Word2Vec model on the tokenized articles.
# NOTE(review): `size=` is the gensim 3.x parameter name; in gensim >= 4.0
# it was renamed `vector_size` — confirm the installed version before upgrading.
mi_modelo = w2v.Word2Vec(fake['pp'].values,
sg=1, # 1 = skip-gram (0 would be CBOW)
seed=1, # random seed for reproducibility
size=200, # number of embedding dimensions
min_count=5, # ignore words with fewer than 5 occurrences
window=10) # context window size
Wall time: 516 ms
# Most similar words: nearest neighbours of "menas" by cosine similarity.
mi_modelo.wv.most_similar("menas")
[('dicen', 0.9997541904449463), ('nunca', 0.9997429847717285), ('amigos', 0.9997411966323853), ('ayudar', 0.9997411370277405), ('orden', 0.999740481376648), ('haciendo', 0.9997392296791077), ('comunidad', 0.9997387528419495), ('embargo', 0.9997382760047913), ('sino', 0.9997377395629883), ('haber', 0.9997376203536987)]
# Nearest neighbours of "prensa".
mi_modelo.wv.most_similar("prensa")
[('localidad', 0.9997602701187134), ('confirmado', 0.9997563362121582), ('algún', 0.9997508525848389), ('formación', 0.9997497797012329), ('hecho', 0.999744713306427), ('siempre', 0.9997433423995972), ('toda', 0.9997411370277405), ('hora', 0.9997395873069763), ('nombre', 0.9997395277023315), ('sexual', 0.9997395277023315)]
# Nearest neighbours of "corte".
mi_modelo.wv.most_similar("corte")
[('viernes', 0.99973464012146), ('algún', 0.9997328519821167), ('caso', 0.9997080564498901), ('gracias', 0.9997074007987976), ('ministerio', 0.9997058510780334), ('nueva', 0.9997047781944275), ('hizo', 0.9996985197067261), ('podría', 0.999698281288147), ('hace', 0.9996973872184753), ('casi', 0.9996916055679321)]
def nearest_similarity_cosmul(start1, end1, end2):
    """Solve the analogy "start1 is to end1 as X is to end2" and print it.

    Uses the multiplicative combination objective (3CosMul) over the
    trained model's word vectors.

    Parameters
    ----------
    start1, end1, end2 : str
        Words that must exist in the model vocabulary; raises KeyError
        otherwise (propagated from gensim).

    Returns
    -------
    str
        The predicted analogy word (also printed to stdout).
    """
    similarities = mi_modelo.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    # The top-ranked candidate completes the analogy.
    start2 = similarities[0][0]
    print("{0} es a {1}, lo que {2} es a {3}".format(start1, end1, start2, end2))
    # Return the answer so callers can use it instead of parsing stdout;
    # the original computed it and threw it away.
    return start2
# Example analogy: "menas" is to "extranjeros" as ? is to "inmigrantes".
nearest_similarity_cosmul("menas", "extranjeros", "inmigrantes")
menas es a extranjeros, lo que ningún es a inmigrantes
El modelo tiene un buen resultado al traer palabras parecidas; sin embargo, para hacer analogías el corpus no es suficiente y es muy difícil encontrar temas que puedan generar buenas analogías.
Decidir los mejores parámetros para el modelo y poder definir cual era el mejor modelo
Visualizar el modelo
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.offline import iplot
# Gather the ten nearest neighbours of "ley" followed by those of
# "ministerio" into one word list for plotting.
vocab = [
    palabra
    for consulta in ("ley", "ministerio")
    for palabra, _ in mi_modelo.wv.most_similar(consulta)
]
print(vocab)
['congreso', 'noticia', 'algún', 'vuelta', 'estancia', 'murcia', 'varones', 'pues', 'decreto', 'haber', 'caso', 'hecho', 'próximo', 'niño', 'calle', 'mena', 'medida', 'organización', 'decidió', 'públicas']
# Build a (vocab_size x 200) DataFrame of word vectors, indexed by word.
# Index through the KeyedVectors accessor: Word2Vec.__getitem__ is
# deprecated (the warning below) and removed entirely in gensim 4.0.
X = mi_modelo.wv[mi_modelo.wv.vocab]
matrix = pd.DataFrame(X)
matrix.index = mi_modelo.wv.vocab.keys()
matrix
C:\Users\USER\anaconda3\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
suceso | 0.006212 | 0.031511 | 0.045481 | 0.101416 | -0.031679 | 0.150899 | -0.059783 | -0.014200 | 0.107744 | 0.176405 | ... | 0.050822 | -0.017535 | 0.042748 | -0.073751 | -0.088236 | -0.076950 | -0.062377 | -0.055633 | -0.060737 | -0.058302 |
lugar | 0.005950 | 0.040947 | 0.059403 | 0.136897 | -0.040842 | 0.201386 | -0.075965 | -0.015665 | 0.138248 | 0.228637 | ... | 0.065123 | -0.018993 | 0.060566 | -0.094583 | -0.112451 | -0.098903 | -0.081829 | -0.071580 | -0.077870 | -0.076109 |
brasil | 0.005165 | 0.026123 | 0.037262 | 0.089919 | -0.026611 | 0.130936 | -0.052012 | -0.013126 | 0.092490 | 0.145507 | ... | 0.043393 | -0.012164 | 0.036452 | -0.062901 | -0.074967 | -0.062631 | -0.050171 | -0.049755 | -0.051648 | -0.048988 |
años | 0.008973 | 0.040707 | 0.061044 | 0.140177 | -0.041050 | 0.201412 | -0.080918 | -0.014251 | 0.143369 | 0.234809 | ... | 0.066531 | -0.022884 | 0.063267 | -0.096038 | -0.120391 | -0.097196 | -0.080598 | -0.076261 | -0.079407 | -0.077067 |
después | 0.004588 | 0.037437 | 0.059342 | 0.144849 | -0.044886 | 0.210980 | -0.083985 | -0.015445 | 0.150787 | 0.239436 | ... | 0.070495 | -0.024690 | 0.062590 | -0.098726 | -0.118127 | -0.101004 | -0.084490 | -0.078070 | -0.085354 | -0.076749 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
mercadona | 0.001592 | 0.029226 | 0.041898 | 0.102322 | -0.031841 | 0.150310 | -0.054796 | -0.013188 | 0.104230 | 0.167622 | ... | 0.050685 | -0.017119 | 0.044884 | -0.071129 | -0.086456 | -0.073510 | -0.060821 | -0.052663 | -0.058199 | -0.055017 |
marruecos | 0.005181 | 0.024268 | 0.038444 | 0.091734 | -0.027209 | 0.133286 | -0.050361 | -0.009693 | 0.092897 | 0.150116 | ... | 0.046088 | -0.014870 | 0.037095 | -0.065604 | -0.077990 | -0.062612 | -0.051981 | -0.048857 | -0.052151 | -0.049877 |
italia | 0.003858 | 0.030171 | 0.042547 | 0.097440 | -0.031347 | 0.146198 | -0.056130 | -0.010863 | 0.105478 | 0.166581 | ... | 0.048054 | -0.015512 | 0.045370 | -0.071881 | -0.082600 | -0.072842 | -0.061193 | -0.052945 | -0.057551 | -0.053230 |
usted | 0.003297 | 0.032344 | 0.048360 | 0.108048 | -0.034448 | 0.159390 | -0.064360 | -0.015801 | 0.112862 | 0.186113 | ... | 0.054664 | -0.019894 | 0.049417 | -0.078136 | -0.090361 | -0.079543 | -0.064764 | -0.060393 | -0.062716 | -0.057396 |
jordi | 0.006217 | 0.030590 | 0.045220 | 0.112371 | -0.036519 | 0.166078 | -0.065310 | -0.016198 | 0.118068 | 0.187892 | ... | 0.055352 | -0.017221 | 0.049916 | -0.078550 | -0.095642 | -0.080198 | -0.067379 | -0.059102 | -0.067260 | -0.058912 |
936 rows × 200 columns
# Project the 200-dimensional word vectors down to 2-D with PCA so
# they can be drawn on a plane; keep the word as a third column.
pca = PCA(n_components=2)
proyeccion = pca.fit_transform(matrix)
result = pd.DataFrame(proyeccion, columns=['X', 'Y'])
result['Palabra'] = matrix.index.values
result
X | Y | Palabra | |
---|---|---|---|
0 | 0.137783 | 0.002006 | suceso |
1 | -0.185469 | 0.002270 | lugar |
2 | 0.297975 | -0.001699 | brasil |
3 | -0.208962 | -0.002381 | años |
4 | -0.250687 | -0.002270 | después |
... | ... | ... | ... |
931 | 0.167823 | -0.001028 | mercadona |
932 | 0.273000 | 0.000120 | marruecos |
933 | 0.175010 | 0.001343 | italia |
934 | 0.075135 | 0.001296 | usted |
935 | 0.055082 | 0.000091 | jordi |
936 rows × 3 columns
# Interactive scatter of the PCA projection; hovering a marker shows its word.
layout = go.Layout(title="PCA mi modelo")
trace = go.Scatter(
    x=result['X'].values,
    y=result['Y'].values,
    text=result['Palabra'].values,
    mode='markers',
)
fig = go.Figure(data=trace, layout=layout)
iplot(fig)