#!/usr/bin/env python
# coding: utf-8

# Análisis del usuario @jguaido
#
# Twitter y herramientas de BigData para explorar datos
#
# Roque Leal
#
# DataScience

# ## Summary
#
# This Jupyter Notebook is an exploration of Twitter data associated with the
# user [@jguaido](https://twitter.com/jguaido?lang=es), from 7 April until
# 30 May 2019. The report is written as a scientific approximation of
# different methods for collecting and analysing Twitter data, based on the
# tools available in Python.

# In[ ]:

from collections import Counter  # Frequency counts for the tweet sources

import tweepy           # Twitter API client
import pandas as pd     # Data handling
import numpy as np      # Numerical operations

# Plotting and visualisation:
import seaborn as sns
from IPython.display import display
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# In[2]:

# Import only the four keys used below instead of `import *`, so the
# notebook namespace is not silently polluted by the credentials module.
from credentials import (CONSUMER_KEY, CONSUMER_SECRET,
                         ACCESS_TOKEN, ACCESS_SECRET)


def twitter_config():
    """Return a ``tweepy.API`` client authenticated with the credential keys."""
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
    return tweepy.API(auth)


# In[3]:

extractor = twitter_config()

# Download the user's timeline.
# NOTE(review): `count=20000` is passed through unchanged; the timeline
# endpoint presumably caps the page size far below that, so fewer tweets than
# requested are likely returned. Confirm, and paginate with `tweepy.Cursor`
# if the full date range is needed.
tweets = extractor.user_timeline(screen_name="jguaido", count=20000)
print("Tweets obtenidos: {}.\n".format(len(tweets)))

# Print the first 5 tweets:
print("Los primeros 5 tweets:\n")
for tweet in tweets[:5]:
    print(tweet.text)
    print()


# In[4]:

datos = pd.DataFrame(data=[tweet.text for tweet in tweets], columns=['Tweets'])
display(datos.head(10))


# In[5]:

# Add the relevant per-tweet attributes as columns:
datos['len'] = np.array([len(tweet.text) for tweet in tweets])
datos['ID'] = np.array([tweet.id for tweet in tweets])
datos['Creado'] = np.array([tweet.created_at for tweet in tweets])
datos['Fuente'] = np.array([tweet.source for tweet in tweets])
datos['Likes'] = np.array([tweet.favorite_count for tweet in tweets])
datos['RTs'] = np.array([tweet.retweet_count for tweet in tweets])


# In[6]:

display(datos.head(10))


# In[7]:

# Average tweet length:
media = np.mean(datos['len'])
print("El promedio de caracteres en tweets: {}".format(media))

# Tweet with the most likes and tweet with the most retweets.
# `idxmax` returns the label of the first row holding the maximum.
fav_max = np.max(datos['Likes'])
rt_max = np.max(datos['RTs'])
fav = datos['Likes'].idxmax()
rt = datos['RTs'].idxmax()

# Most liked:
print("El tweet con más likes es: \n{}".format(datos['Tweets'][fav]))
print("Número de likes: {}".format(fav_max))
print("{} caracteres.\n".format(datos['len'][fav]))

# Most retweeted:
print("El tweet con más retweets es: \n{}".format(datos['Tweets'][rt]))
print("Número de retweets: {}".format(rt_max))
print("{} caracteres.\n".format(datos['len'][rt]))


# In[8]:

# Time series indexed by creation time:
tlen = pd.Series(data=datos['len'].values, index=datos['Creado'])
tfav = pd.Series(data=datos['Likes'].values, index=datos['Creado'])
tret = pd.Series(data=datos['RTs'].values, index=datos['Creado'])

tlen.plot(figsize=(16, 4), color='r');


# In[9]:

# Likes vs. retweets over time:
tfav.plot(figsize=(16, 4), label="Likes", legend=True)
tret.plot(figsize=(16, 4), label="Retweets", legend=True);


# In[10]:

# Share of tweets per source (client application).
# Counter keeps first-seen order on Python 3.7+, matching the wedge order of
# the original hand-written loop.
conteo_fuentes = Counter(datos['Fuente'])
fuentes = list(conteo_fuentes)
percent = np.array([conteo_fuentes[fuente] for fuente in fuentes], dtype=float)
# Convert counts to real percentages of the tweets actually downloaded.
# Dividing by a fixed 100 is only right for exactly 100 tweets; with fewer,
# the values sum to less than 1 and matplotlib draws a partial pie.
if len(datos):
    percent *= 100.0 / len(datos)

# Pie chart:
pie_chart = pd.Series(percent, index=fuentes, name='Fuentes')
pie_chart.plot.pie(fontsize=11, autopct='%.2f', figsize=(6, 6));


# In[11]:

from textblob import TextBlob
import re

# Matches @mentions, any character that is not an ASCII letter, digit, space
# or tab, and URLs. Raw string: the pattern is unchanged, but the invalid
# string escapes (`\w`, `\/`, `\S`) no longer trigger warnings.
_PATRON_LIMPIEZA = re.compile(
    r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")


def limpia_tweet(tweet):
    """Return *tweet* with mentions, URLs and special characters removed.

    Every match of the cleaning pattern is replaced by a space and the
    remaining words are re-joined with single spaces. Because the pattern
    only keeps ASCII letters and digits, accented characters are also
    replaced by spaces.
    """
    return ' '.join(_PATRON_LIMPIEZA.sub(" ", tweet).split())


def analiza_sentimiento(tweet):
    """Classify the TextBlob polarity of the cleaned *tweet*.

    Returns 1 for positive polarity, 0 for exactly zero and -1 for negative.

    NOTE(review): TextBlob's default analyzer is presumably English-oriented
    while these tweets are in Spanish -- confirm the scores are meaningful.
    """
    polaridad = TextBlob(limpia_tweet(tweet)).sentiment.polarity
    if polaridad > 0:
        return 1
    if polaridad == 0:
        return 0
    return -1


datos['AdS'] = np.array([analiza_sentimiento(tweet) for tweet in datos['Tweets']])
display(datos.head(10))


# In[12]:

tweets_positivos = datos.loc[datos['AdS'] > 0, 'Tweets'].tolist()
tweets_neutros = datos.loc[datos['AdS'] == 0, 'Tweets'].tolist()
tweets_negativos = datos.loc[datos['AdS'] < 0, 'Tweets'].tolist()

total_tweets = len(datos['Tweets'])


def _porcentaje(cantidad):
    """Return *cantidad* as a percentage of all tweets (0.0 when there are none)."""
    return cantidad * 100 / total_tweets if total_tweets else 0.0


pct_positivos = _porcentaje(len(tweets_positivos))
pct_neutros = _porcentaje(len(tweets_neutros))
pct_negativos = _porcentaje(len(tweets_negativos))

print("Porcentaje de tweets positivos: {}%".format(pct_positivos))
print("Porcentaje de tweets neutros: {}%".format(pct_neutros))
print("Porcentaje de tweets negativos: {}%".format(pct_negativos))


# In[13]:

# Data to plot
labels = 'Positive Tweets', 'Neutral Tweets', 'Negative Tweets'
sizes = [pct_positivos, pct_neutros, pct_negativos]
colors = ['gold', 'blue', 'black']
explode = (0.07, 0.07, 0.07)  # offset every slice by the same amount

# Plot
plt.figure(figsize=(10, 6))
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=140)
plt.axis('equal')
plt.show()


# ## Comparison
#
# This section compares the sentiment associated with the Twitter users
# [@jguaido](https://twitter.com/jguaido?lang=es),
# [@leopoldolopez](https://twitter.com/leopoldolopez?lang=es) and
# [@NicolasMaduro](https://twitter.com/NicolasMaduro?lang=es), in order to
# look at the last 500 messages published by these users and the message
# these accounts wish to convey.
# In[16]:

import tweepy
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import seaborn as sns
sns.set()
import pandas as pd
import numpy as np
from datetime import datetime
import math
import json


# In[17]:

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()


# In[18]:

from config import (consumer_key, consumer_secret,
                    access_token, access_token_secret)


# In[19]:

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# JSONParser makes the API return plain dicts instead of tweepy model objects.
api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())


# In[20]:

target_terms = ["@jguaido", "@leopoldolopez", "@NicolasMaduro"]


# In[21]:

sentiments = []
desired = 500  # number of tweets to collect per target


# In[22]:

# NOTE(review): `api.search` queries for tweets matching the term; presumably
# that includes mentions by other users, not only the account's own posts --
# confirm this is the intended sample.
for target in target_terms:
    counter = 0
    oldest_id = None  # lowest tweet id seen so far, used to page backwards

    # Keep requesting pages until `desired` tweets were collected for this target.
    while counter < desired:
        search_kwargs = {"count": desired - counter, "result_type": "recent"}
        if oldest_id is not None:
            # Ask only for tweets older than those already processed;
            # without this every request returns the same page again.
            search_kwargs["max_id"] = oldest_id - 1
        public_tweets = api.search(target, **search_kwargs)

        statuses = public_tweets['statuses']
        if not statuses:
            # Nothing (more) available: stop instead of looping forever.
            break

        for tweet in statuses:
            # Run the VADER analysis on each tweet.
            results = analyzer.polarity_scores(tweet["text"])

            # Parse Twitter's timestamp and keep "YYYY-MM-DD HH:MM:SS".
            created = datetime.strptime(tweet["created_at"],
                                        "%a %b %d %H:%M:%S %z %Y")
            Date = created.strftime("%Y-%m-%d %H:%M:%S")

            # Store each score under its own key ("Neg"/"Neu" were swapped).
            sentiments.append({"Source": target,
                               "Text": tweet["text"],
                               "DateTime": Date,
                               "Compound": results["compound"],
                               "Pos": results["pos"],
                               "Neg": results["neg"],
                               "Neu": results["neu"],
                               "Tweets Ago": counter + 1})
            counter += 1

        oldest_id = min(tweet["id"] for tweet in statuses)

sentiments_df = pd.DataFrame.from_dict(sentiments)  # list of dicts -> DataFrame
if sentiments_df.empty:
    # Fail with a clear message rather than a KeyError in the plotting cells.
    raise RuntimeError(
        "No tweets were retrieved for {}; nothing to analyse.".format(target_terms))
sentiments_df


# In[23]:

markersize = 160
# "black" replaces "bk", which is not a single colour and is rejected by
# recent matplotlib releases.
kws = dict(s=markersize, linewidth=.8, edgecolor="black")
sns.set(font_scale=1.5)
max_tweets = sentiments_df["Tweets Ago"].max()
pyber_palette = ['#c6fcff', '#1b919a', '#ff0033', '#000099', '#ffff66']  # light sky blue, green, red, blue, yellow

# Rename the column header to match the solution example.
sentiments_df.rename(columns={'Source': 'Media Sources'}, inplace=True)

# Date (YYYY-MM-DD) of the last tweet processed, used in the plot titles.
# Read from the DataFrame instead of relying on the leaked loop variable.
Date = sentiments_df["DateTime"].iloc[-1][:10]

sns.lmplot(x='Tweets Ago', y='Compound', data=sentiments_df,
           fit_reg=False,  # no regression line should be displayed
           palette=pyber_palette,
           scatter_kws=kws,
           hue='Media Sources',
           height=10,  # `size` was renamed to `height` in seaborn 0.9
           legend_out=True)

plt.title("Análisis de los Sentimientos ({})".format(Date))
plt.ylabel("Polaridad de Tweet")
plt.xlabel("Tweets Ago")
plt.xlim(max_tweets + 5, -5.0)  # margins so the plot doesn't end at max values
plt.ylim(-1, 1)

# Save the figure
plt.savefig("Sentiment Analysis of Media Tweets.png")
plt.show()


# In[27]:

# Mean compound score per account (grouped once, then looked up).
mean_compound = sentiments_df.groupby('Media Sources')['Compound'].mean()
jguaido = mean_compound['@jguaido']
leopoldolopez = mean_compound['@leopoldolopez']
NicolasMaduro = mean_compound['@NicolasMaduro']
print("@jguaido = " + str(jguaido) + ", @leopoldolopez = " + str(leopoldolopez)
      + ", @NicolasMaduro = " + str(NicolasMaduro))

x_labels = ['jguaido', 'leopoldolopez', 'NicolasMaduro']
y_sentiments = [jguaido, leopoldolopez, NicolasMaduro]
palette = ['#c6fcff', '#1b919a', '#ff0033', '#000099', '#ffff66']  # light sky blue, green, red, blue, yellow
x_pos = [0, 1, 2]  # positions of the accounts on the x axis

plt.bar(x_pos, y_sentiments, color=sns.color_palette(palette, 5),
        align='center', width=1, edgecolor='black', linewidth=.6)
plt.xlim(-0.5, len(x_labels) - 0.49)  # 0.49 instead of 0.5 to show the black edge line of the last bar
plt.ylim(min(y_sentiments) - 0.1, max(y_sentiments) + 0.1)  # margins of +/-0.1 beyond max/min values
plt.xticks(x_pos, x_labels)
plt.title("Overall Media Sentiment based on Twitter ({})".format(Date), fontsize=14)
plt.ylabel("Tweet Polarity", fontsize=14)

# Annotate each bar with its value, rounded to 2 decimal places.
for a, b in zip(x_pos, y_sentiments):
    if b <= 0:
        label_y = b - 0.035  # place the text below negative bars
    else:
        label_y = b + 0.015  # place the text above positive bars
    plt.text(a - 0.25, label_y, str(round(b, 2)), fontsize=13)

# Save the figure
plt.savefig("Overall Media Sentiment based on Twitter.png")
plt.show()


# In[ ]: