#!/usr/bin/env python
# coding: utf-8

# In[1]:

import tweepy  # for Twitter's API
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

get_ipython().run_line_magic('matplotlib', 'inline')

# Reproduced from: https://dev.to/rodolfoferro/sentiment-analysis-on-trumpss-tweets-using-python-


# In[2]:

# Twitter App access keys for the Twitter API
# Don't show your keys to anyone!

# Consume:
CONSUMER_KEY = 'consumer_key'
CONSUMER_SECRET = 'consumer_secret'

# Access:
ACCESS_TOKEN = 'access_token'
ACCESS_SECRET = 'access_secret'


# In[3]:

# Set up the Twitter API
def twitter_setup():
    """Return a tweepy API client authenticated with the keys defined above."""
    # Authentication and access using user-specific keys
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
    # Return API with authentication
    return tweepy.API(auth)


# In[31]:

# Thanks to https://gist.github.com/yanofsky/5436496 for the code on how to
# fetch as many tweets as possible.
# However, the Twitter API limits you to the latest ~3240 tweets.

# Create object to get tweets
get_tweets = twitter_setup()
tweets = []

# Get the most recent 200 tweets (the largest page this script requests)
new_tweets = get_tweets.user_timeline(screen_name="JustinTrudeau", count=200)
tweets.extend(new_tweets)

# Id of the oldest tweet fetched so far, less one; stays None when the
# timeline is empty, so an empty result no longer raises IndexError.
oldest = None

# Keep getting tweets until there are no more - or, in this case, until we
# reach the limit
while new_tweets:
    # Save the id of the oldest tweet, less one
    oldest = tweets[-1].id - 1
    # All subsequent requests use the max_id parameter, to prevent getting
    # duplicates
    new_tweets = get_tweets.user_timeline(
        screen_name="JustinTrudeau", count=200, max_id=oldest
    )
    # Save most recent
    tweets.extend(new_tweets)
    print("...%s tweets downloaded so far" % (len(tweets)))

print("Number of tweets extracted: {}.\n".format(len(tweets)))

# Print most recent just to check
print("Most recent 5 tweets:\n")
for tweet in tweets[:5]:
    print(tweet.text)
    print()


# In[36]:

# Create pandas dataframe to store the tweets
data_temp = pd.DataFrame(data=[tweet.text for tweet in tweets], columns=["Tweets"])

# Can print info about the tweets: id, created_at, source, favorite_count,
# retweet_count, geo, coordinates, entities
data_temp['Length'] = np.array([len(tweet.text) for tweet in tweets])
data_temp['Date'] = np.array([tweet.created_at for tweet in tweets])
data_temp['Source'] = np.array([tweet.source for tweet in tweets])
data_temp['Favourites'] = np.array([tweet.favorite_count for tweet in tweets])
data_temp['RTs'] = np.array([tweet.retweet_count for tweet in tweets])

# Display of first 10 elements from dataframe:
display(data_temp.head(10))


# In[37]:

# Two things to do:
# First, get rid of any retweets - we're more interested in what Trudeau
# actually tweets himself
# Second, we need to filter into English and French

# 1. Get rid of retweets
def remove_retweets(df):
    """Return the rows of *df* whose "Tweets" text is not a retweet.

    A retweet is recognised by its text starting with "RT @". Matching "RT"
    anywhere in the text would also discard original tweets that merely
    contain those two letters (e.g. an upper-case word such as "SUPPORT").
    """
    is_retweet = df["Tweets"].str.startswith("RT @", na=False)
    return df[~is_retweet]


# Get rid of RTs
data_no_rt = remove_retweets(data_temp)
# 2. Detect language
from langdetect import DetectorFactory, detect_langs
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic unless seeded; fix the seed so the
# English/French split is reproducible between runs.
DetectorFactory.seed = 0


def _has_language(string, lang_code):
    """Return True if langdetect lists *lang_code* among the candidates for *string*.

    Text with no detectable features makes detect_langs raise
    LangDetectException; such text is treated as not matching.
    """
    try:
        candidates = detect_langs(string)
    except LangDetectException:
        return False
    return any(item.lang == lang_code for item in candidates)


def isEnglish(string):
    """Return True if English is one of the languages detected in *string*."""
    return _has_language(string, "en")


def isFrench(string):
    """Return True if French is one of the languages detected in *string*."""
    return _has_language(string, "fr")


# Methods to get English or French tweets
def get_en(df):
    """Return the rows of *df* whose "Tweets" text is detected as English."""
    return df[df['Tweets'].apply(isEnglish).astype(bool)]


def get_fr(df):
    """Return the rows of *df* whose "Tweets" text is detected as French."""
    return df[df['Tweets'].apply(isFrench).astype(bool)]


# Now let's actually separate the tweets into our final data frames
data_en = get_en(data_no_rt)
data_fr = get_fr(data_no_rt)

display(data_en.head(5))
display(data_fr.head(5))


# In[39]:

# Now that we've removed the retweets and sorted the tweets into English and
# French, we can do some exploratory data analysis

# First get mean length
mean_en = np.mean(data_en['Length'])
mean_fr = np.mean(data_fr['Length'])

print("Average tweet length in English: {}".format(mean_en))
print("Average tweet length in French: {}".format(mean_fr))


# In[44]:

# Next let's look at tweets with most faves and RTs
# Write method to make it prettier :)
def get_faves_rts(data, language):
    """Print the most-favourited and the most-retweeted tweet in *data*.

    *data* needs the columns "Tweets", "Length", "Favourites" and "RTs";
    *language* is only used to label the printed output. When several tweets
    tie for a maximum, the first one in *data* is reported.
    """
    if data.empty:
        # Nothing to report; avoids an IndexError on the lookups below.
        print("No tweets found in " + language + ".\n")
        return

    max_faves = np.max(data["Favourites"])
    max_rt = np.max(data["RTs"])

    # idxmax returns the index label of the first row holding the maximum
    favourites = data["Favourites"].idxmax()
    rt = data["RTs"].idxmax()

    print("The tweet with the most favourites in " + language + " is: \n{}".format(data.loc[favourites, 'Tweets']))
    print("Number of favourites: {}".format(max_faves))
    print("{} characters.\n".format(data.loc[favourites, 'Length']))

    print("The tweet with the most retweets in " + language + " is: \n{}".format(data.loc[rt, 'Tweets']))
    print("Number of retweets: {}".format(max_rt))
    print("{} characters.\n".format(data.loc[rt, 'Length']))


get_faves_rts(data_en, "English")
get_faves_rts(data_fr, "French")


# In[45]:

# Use fivethirtyeight style plots
import matplotlib.style as style
style.use('fivethirtyeight')


# In[46]:

# Look at the length, likes, and RTs as a time series
# English
tlen_en = pd.Series(data=data_en['Length'].values, index=data_en['Date'])
tlikes_en = pd.Series(data=data_en['Favourites'].values, index=data_en['Date'])
trt_en = pd.Series(data=data_en['RTs'].values, index=data_en['Date'])

# French
tlen_fr = pd.Series(data=data_fr['Length'].values, index=data_fr['Date'])
tlikes_fr = pd.Series(data=data_fr['Favourites'].values, index=data_fr['Date'])
trt_fr = pd.Series(data=data_fr['RTs'].values, index=data_fr['Date'])


# In[47]:

# Now let's look at the length of tweets
# Let's make a time series
tlen_plot = tlen_en.plot(figsize=(20, 10), label="Length (English)", legend=True)
tlen_fr.plot(figsize=(20, 10), label="Length (French)", legend=True, ax=tlen_plot)
tlen_plot.text(x="2016-12", y=165, s="Nothing interesting here",
               fontsize=35, alpha=0.85, weight="bold")
tlen_plot.text(x="2016-12", y=154,
               s="Length of Trudeau's tweets in English and French, December 2016 to October 2017 ",
               fontsize=30, alpha=0.85)
tlen_plot.tick_params(labelsize=25)
tlen_plot.text(x="2016-11-10", y=100, s="Tweet Length",
               fontsize=25, rotation="vertical", alpha=0.7)
tlen_plot.legend(fontsize=25, loc=3, ncol=2)
tlen_plot.xaxis.label.set_visible(False)

# There's nothing that jumps out at me about the length of the tweets in
# English and French. Next let's look at favourites and retweets.
# In[48]:

# Due to the huge discrepancy in activity between the English and French
# tweets, I'll plot the English favourites and retweets together
t_en_plot = tlikes_en.plot(figsize=(20, 10), label="Favourites (English)", legend=True)
trt_en.plot(figsize=(20, 10), label="Retweets (English)", legend=True, ax=t_en_plot)
t_en_plot.text(x="2016-12", y=900000, s="#WelcomeToCanada",
               fontsize=35, alpha=0.85, weight="bold")
t_en_plot.text(x="2016-12", y=820000,
               s="Trudeau's most favourited and retweeted in English, December 2016 to October 2017",
               fontsize=30, alpha=0.85)
t_en_plot.tick_params(labelsize=25)
t_en_plot.text(x="2016-10-30", y=650000, s="Favourites and Retweets",
               fontsize=25, rotation="vertical", alpha=0.7)
t_en_plot.legend(fontsize=25, loc=1, ncol=1)
t_en_plot.xaxis.label.set_visible(False)

# We knew from above that the tweet with the most retweets and favourites was
# "To those fleeing persecution, terror & war, Canadians will welcome you,
# regardless of your faith. Diversity is our strength #WelcomeToCanada," with
# 772813 favourites and 420965 retweets. What wasn't clear, though, was how
# much more action this particular tweet got! I'd say this says something
# about Canadians - we're very welcoming!
# In[49]:

# French
t_fr_plot = tlikes_fr.plot(figsize=(20, 10), label="Favourites (French)", legend=True)
trt_fr.plot(figsize=(20, 10), label="Retweets (French)", legend=True, ax=t_fr_plot)
t_fr_plot.text(x="2016-12", y=33000, s="L'amour, c'est l'amour",
               fontsize=35, alpha=0.85, weight="bold")
t_fr_plot.text(x="2016-12", y=30500,
               s="Trudeau's most favourited and retweeted in French, December 2016 to October 2017",
               fontsize=30, alpha=0.85)
t_fr_plot.tick_params(labelsize=25)
t_fr_plot.text(x="2016-10-30", y=22000, s="Favourites and Retweets",
               fontsize=25, rotation="vertical", alpha=0.7)
t_fr_plot.legend(fontsize=25, loc=1, ncol=1)
t_fr_plot.xaxis.label.set_visible(False)

# Again, we knew that "L'amour, c'est l'amour. #PrideTO" got the most
# favourites, at 28302. It's interesting to see that the tweet on welcoming
# refugees got more retweets in French. Next, let's look at the source of the
# tweets.


# In[50]:

from collections import Counter


# Helper methods
def get_source(data):
    """Return the distinct values of data["Source"] in order of first appearance."""
    # dict.fromkeys keeps insertion order, so duplicates are dropped without
    # reordering the sources.
    return list(dict.fromkeys(data["Source"]))


def source_percentage(data, sources):
    """Return, for each entry of *sources*, the percentage of rows in *data* from it.

    The result is a numpy float array aligned with *sources*. An empty *data*
    yields all zeros instead of dividing by zero.
    """
    counts = Counter(data["Source"])
    percent = np.array([counts[source] for source in sources], dtype=float)
    total = len(data["Source"])
    if total:
        # Share of all tweets, as a percentage (previously the raw counts were
        # divided by a constant 100, which is not a percentage).
        percent = percent / total * 100
    return percent


# In[51]:

# Now let's actually get the sources
sources_en = get_source(data_en)
sources_fr = get_source(data_fr)

percent_en = source_percentage(data_en, sources_en)
percent_fr = source_percentage(data_fr, sources_fr)

# Plot the pie chart for English tweets first
pie_chart_en = pd.Series(percent_en, index=sources_en, name='')
pie_en = pie_chart_en.plot.pie(fontsize=15, autopct='%.2f', figsize=(5, 5), legend=False)
pie_en.set_title("Most English tweets are from the web", weight="bold", loc="left")


# In[52]:

# Let's check out the source of the French tweets too
pie_chart_fr = pd.Series(percent_fr, index=sources_fr, name="")
pie_fr = pie_chart_fr.plot.pie(fontsize=15, autopct="%.2f", figsize=(5, 5), legend=False)
pie_fr.set_title("And most French tweets are from the web, too", weight="bold", loc="left")

# Nothing too crazy here - the majority of tweets are from the Twitter web
# client in both languages. For some reason, the percentages are slightly
# different in French versus English - I think this could be due to the
# filtering/cleaning I did, or just due to what Trudeau tweeted.
#
# Now that we've done a bit of basic statistical exploration, let's look at
# the sentiment.


# In[53]:

# Now time to analyze the sentiment
# We'll start with using VADER and then try using TextBlob
# Not sure how well VADER works for French, but TextBlob-Fr works for French
# for sure!

# 1. VADER
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Created on first use and then reused: building a new analyzer for every
# tweet repeats its set-up work for no benefit.
_VADER_ANALYZER = None


def sent_vader(tweet):
    """Return VADER's "compound" polarity score for *tweet*."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER.polarity_scores(tweet)["compound"]


# 2. TextBlob
from textblob import TextBlob


def textblob_en(tweet):
    """Return TextBlob's polarity for *tweet* using its default analyzer."""
    txt_blob = TextBlob(tweet)
    return txt_blob.sentiment.polarity


# French - returns a tuple (polarity, subjectivity)
# For some reason this is different than the English version
from textblob_fr import PatternTagger, PatternAnalyzer


def textblob_fr(tweet):
    """Return the polarity of *tweet* from the textblob_fr tagger and analyzer."""
    txt_blob = TextBlob(tweet, pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())
    # First element of the (polarity, subjectivity) tuple
    return txt_blob.sentiment[0]


# In[54]:

# Get the sentiment in both languages
sent_col = data_en["Tweets"].apply(sent_vader)
data_en = data_en.assign(VADER=sent_col)

sent_col = data_fr["Tweets"].apply(sent_vader)
data_fr = data_fr.assign(VADER=sent_col)

# Time series for plotting
vader_en = pd.Series(data=data_en['VADER'].values, index=data_en["Date"])
vader_fr = pd.Series(data=data_fr['VADER'].values, index=data_fr["Date"])


# In[55]:

# Get the mean sentiment from December 2016 to September 2017
vader_en_mean = data_en["VADER"].mean()
vader_fr_mean = data_fr["VADER"].mean()
print("Mean sentiment for English tweets using VADER: " + str(vader_en_mean))
print("Mean sentiment for French tweets using VADER: " + str(vader_fr_mean))

# Trudeau's English tweets seem to be much more positive than his French ones!


# In[56]:

# Let's look at the sentiment of tweets in English and French, according to VADER
vader_plot = vader_en.plot(figsize=(20, 10), label="Sentiment (English)", legend=True)
# Draw onto the same axes explicitly rather than relying on the current axes
vader_fr.plot(figsize=(20, 10), label="Sentiment (French)", legend=True, ax=vader_plot)
vader_plot.axhline(y=0, color='black', linewidth=1.3, alpha=.7)
# Axes.text takes a numeric y; it was previously passed as a string
vader_plot.text(x="2016-12", y=1.32,
                s='VADER: Generally positive in English, more neutral in French',
                fontsize=35, alpha=.85, weight="bold")
vader_plot.text(x="2016-12", y=1.15,
                s='Sentiment of Trudeau\'s Tweets, December 2016 to October 2017',
                fontsize=30, alpha=.85)
vader_plot.xaxis.label.set_visible(False)
vader_plot.tick_params(labelsize=25)
vader_plot.text(x="2016-11", y=-0.05, s="Negative Sentiment",
                fontsize=25, rotation="vertical", alpha=0.7)
vader_plot.text(x="2016-11", y=1, s="Positive Sentiment",
                fontsize=25, rotation="vertical", alpha=0.7)
vader_plot.legend(fontsize=25, loc=3, ncol=2)


# In[57]:

# Now TextBlob
# English
sent_col = data_en["Tweets"].apply(textblob_en)
data_en = data_en.assign(TextBlob=sent_col)

# French
sent_col = data_fr["Tweets"].apply(textblob_fr)
data_fr = data_fr.assign(TextBlob=sent_col)

# Time series for plotting
# NOTE(review): these two assignments rebind the names of the textblob_en /
# textblob_fr scoring functions to Series, so this cell cannot be re-run
# without first re-running the cell that defines those functions.
textblob_en = pd.Series(data=data_en['TextBlob'].values, index=data_en["Date"])
textblob_fr = pd.Series(data=data_fr['TextBlob'].values, index=data_fr["Date"])


# In[58]:

# Get mean sentiment in both languages
textblob_en_mean = data_en["TextBlob"].mean()
textblob_fr_mean = data_fr["TextBlob"].mean()

print("Mean sentiment for English tweets using TextBlob: " + str(textblob_en_mean))
print("Mean sentiment for French tweets using TextBlob: " + str(textblob_fr_mean))

# The mean sentiment of tweets in both languages is much closer according to
# TextBlob than VADER. Maybe this could be because I used TextBlob-Fr, which
# is actually made for French, so it could be more accurate.


# In[59]:

# Let's look at the sentiment of tweets in English and French, according to TextBlob
textblob_plot = textblob_en.plot(figsize=(20, 10), label="Sentiment (English)", legend=True)
textblob_fr.plot(figsize=(20, 10), label="Sentiment (French)", legend=True, ax=textblob_plot)
textblob_plot.axhline(y=0, color='black', linewidth=1.3, alpha=.7)
textblob_plot.text(x="2016-12", y=1.32,
                   s='TextBlob: More positive in both languages:',
                   fontsize=35, alpha=.85, weight="bold")
textblob_plot.text(x="2016-12", y=1.15,
                   s='Sentiment of Trudeau\'s tweets, December 2016 to October 2017',
                   fontsize=30, alpha=.85)
# Hide the x-axis label, as the other time-series plots in this notebook do
textblob_plot.xaxis.label.set_visible(False)
textblob_plot.tick_params(labelsize=25)
textblob_plot.text(x="2016-11", y=-0.05, s="Negative Sentiment",
                   fontsize=25, rotation="vertical", alpha=0.7)
textblob_plot.text(x="2016-11", y=1, s="Positive Sentiment",
                   fontsize=25, rotation="vertical", alpha=0.7)
textblob_plot.legend(fontsize=25, loc=3, ncol=2)


# In[60]:

# TextBlob seems to assign more positive sentiment to tweets than VADER - I wonder why?
# Let's compare the output of VADER and TextBlob for English, and then for French
en_plot = vader_en.plot(figsize=(20, 10), label="VADER", legend=True)
# Draw onto the same axes explicitly rather than relying on the current axes
textblob_en.plot(figsize=(20, 10), label="TextBlob", legend=True, ax=en_plot)
en_plot.axhline(y=0, color='black', linewidth=1.3, alpha=.7)
# Axes.text takes a numeric y; it was previously passed as a string
en_plot.text(x="2016-12", y=1.32, s='TextBlob assigns more positive sentiment',
             fontsize=35, alpha=.85, weight="bold")
en_plot.text(x="2016-12", y=1.15,
             s='Sentiment of Trudeau\'s tweets in English, December 2016 to October 2017',
             fontsize=30, alpha=.85)
en_plot.xaxis.label.set_visible(False)
en_plot.tick_params(labelsize=25)
en_plot.text(x="2016-11", y=-0.05, s="Negative Sentiment",
             fontsize=25, rotation="vertical", alpha=0.7)
en_plot.text(x="2016-11", y=1, s="Positive Sentiment",
             fontsize=25, rotation="vertical", alpha=0.7)
en_plot.legend(fontsize=25, loc=3, ncol=2)


# In[61]:

# French comparison of VADER vs TextBlob
fr_plot = vader_fr.plot(figsize=(20, 10), label="VADER", legend=True)
textblob_fr.plot(figsize=(20, 10), label="TextBlob", legend=True, ax=fr_plot)
fr_plot.axhline(y=0, color='black', linewidth=1.3, alpha=.7)
fr_plot.text(x="2016-12", y=1.32, s='VADER and TextBlob show more similarities',
             fontsize=35, alpha=.85, weight="bold")
fr_plot.text(x="2016-12", y=1.15,
             s='Sentiment of Trudeau\'s tweets in French, December 2016 to October 2017',
             fontsize=30, alpha=.85)
fr_plot.xaxis.label.set_visible(False)
fr_plot.tick_params(labelsize=25)
fr_plot.text(x="2016-11", y=-0.05, s="Negative Sentiment",
             fontsize=25, rotation="vertical", alpha=0.7)
fr_plot.text(x="2016-11", y=1, s="Positive Sentiment",
             fontsize=25, rotation="vertical", alpha=0.7)
fr_plot.legend(fontsize=25, loc=3, ncol=2)


# In[62]:

# Next look at sentiment per month
# Currently have a timestamp object of the form yyyy-mm-dd hh:mm:ss
# Need to group data by year, then month - so use a method
from datetime import datetime


def get_month_year(tweet_date):
    """Return *tweet_date* formatted as a zero-padded 'YYYY-MM' string."""
    return tweet_date.strftime('%Y-%m')


# Plotting and grouping method
def monthly_sentiment(data, nlp_method):
    """Return the mean of column *nlp_method* for each month of data["Date"].

    The result is a one-column DataFrame indexed by "MonthYear"
    ('YYYY-MM' strings).
    """
    month_year_column = data["Date"].apply(get_month_year)
    data = data.assign(MonthYear=month_year_column)
    return data.groupby("MonthYear").agg({nlp_method: "mean"})


# In[63]:

# Group
en_vader = monthly_sentiment(data_en, "VADER")
en_textblob = monthly_sentiment(data_en, "TextBlob")
fr_vader = monthly_sentiment(data_fr, "VADER")
fr_textblob = monthly_sentiment(data_fr, "TextBlob")

# Rename columns
en_vader.columns = ["Vader (English)"]
fr_vader.columns = ["Vader (French)"]
en_textblob.columns = ["TextBlob (English)"]
fr_textblob.columns = ["TextBlob (French)"]


# In[ ]:

# Save as CSV optionally
#data_en.to_csv("Trudeau_Tweets_Oct2_English.csv", index = True)
#data_fr.to_csv("Trudeau_Tweets_Oct2_French.csv", index = True)


# In[64]:

# The data is in a data frame - need to convert from
# pandas.core.frame.DataFrame to pandas.core.series.Series
# Not sure if this is entirely necessary
# (.ix has been removed from pandas; .iloc is the positional equivalent)
s_en_vader = en_vader.iloc[:, 0]
s_fr_vader = fr_vader.iloc[:, 0]
s_en_textblob = en_textblob.iloc[:, 0]
s_fr_textblob = fr_textblob.iloc[:, 0]


# In[65]:

# Plot all together - this is essentially plotting 4 separate data frames
# together in one figure
# Now we can see how the sentiment of Trudeau's tweets has changed, in both
# languages, with both VADER and TextBlob
ax = s_en_vader.plot(figsize=(20, 10), legend=True)
s_fr_vader.plot(ax=ax, legend=True)
s_en_textblob.plot(ax=ax, legend=True)
s_fr_textblob.plot(ax=ax, legend=True)
ax.axhline(y=0, color='black', linewidth=1.3, alpha=.7)
ax.xaxis.label.set_visible(False)
ax.yaxis.label.set_visible(True)
ax.tick_params(labelsize=25)
plt.xticks(rotation=45)
plt.suptitle("Trudeau's tweets look more positive in English -- according to VADER ",
             fontsize=35, weight="bold", alpha=0.85)
plt.title("Monthly averaged sentiment in English and French, using VADER and TextBlob",
          alpha=0.85, fontsize=30, loc="left")
ax.legend(fontsize=25, loc=3, ncol=4)
ax.set_ylabel("Monthly Averaged Sentiment", fontsize=25, alpha=0.7)

# It's interesting that the English sentiment comes across as more positive,
# using both VADER and TextBlob, compared to the French tweets. French is a
# beautiful language, but is English just more enthusiastic? Or is that just
# how Trudeau tends to tweet, or can this be attributed to his command of each
# language and what he's more used to? It's good to see that the sentiment
# generally follows a similar pattern, in both languages, with both text
# processing tools. It's also interesting to see the drop-off in sentiment in
# October - it makes sense given the awful events that have happened in the
# past two days both in Edmonton and Las Vegas. We'll have to wait to see if
# this downward trend continues. Hopefully not! But if it does, it could say
# two things: First, it could be a reflection of world events. Or, it could
# mean Trudeau is feeling worn down as Prime Minister.


# In[ ]: