#!/usr/bin/env python
# coding: utf-8

# # Emotional Text Analysis
# ## Statistical text analysis for emotional extraction and some generalizations
# #### Alberto Barradas
# [@abcsds](http://github.com/abcsds)

# ## Emotion, and behaviour.
#
# There are several theories of emotion, all indicate that emotions are a precursor of behaviour.

# ## Models of Emotion

# ### Affect
# Most used model for text analysis. Maps into 2D plane. Cannot map back into emotions.
# ![](img/Circumplex.jpg)

# ### Bio-chemical
# Very precise, closest to biochemical definition of emotions. Not practical.
# ![](img/Loevheim.png)

# ### Ekman's model of emotion
# Widely used in face recognition programs. Widely accepted. Requires face-recognition software, thus very costly.
# ![](img/Ekman.jpg)

# ### Functional
# Based on ethological observations. Superset of Ekman's model.
# ![](img/Plutchik.png)
# http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm

# ### The dictionary

# In[19]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string

# The inline-plot magic only exists inside IPython/Jupyter; when this file is
# executed by a plain interpreter `get_ipython` is undefined, so skip it.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# In[2]:

# Column names of the lexicon that hold the per-word emotion flags.
emotions = ["anger", "anticipation", "disgust", "fear", "joy",
            "negative", "positive", "sadness", "surprise", "trust"]
df = pd.read_csv("dict.csv")
df.head(10)

# In[3]:

# Number of lexicon words flagged for each emotion.
df[emotions].sum()

# In[15]:

print(len(df))
# Same counts as a percentage of all lexicon entries.
df[emotions].sum() / len(df) * 100

# In[30]:

fig = plt.figure(figsize=(16, 10))
y_pos = np.arange(len(emotions))
percentage = (df[emotions].sum() / len(df) * 100).tolist()
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
# One colour per emotion, in the same order as `emotions`.
colors = ["#007C37", "#79BF2A", "#EBC527", "#E66F11", "#DB1245",
          "#0C2C30", "#F53022", "#7D4CA1", "#296CAB", "#1781AA"]
bars = ax.bar(y_pos, percentage, align='center', color=colors)
plt.xticks(y_pos, emotions)
plt.ylabel('Percentage')

# ### Applying it
# Using tweepy for twitter access, but can be done with any text. http://www.tweepy.org/
#
# Tools used:
# - Tweepy listener for extracting twitter data.
# - Tweepy Stream and OauthHandler class for connectivity
# - Python's json library for extracting text from tweets
# - A csv to python-dictionary reader from the csv library
# - Seaborn for visualizations https://seaborn.pydata.org/

# In[31]:

from tweepy.streaming import StreamListener
from tweepy import Stream
from tweepy import OAuthHandler
import json
from csv import DictReader
import seaborn as sns

# #### Hidden keys :)
# Access twitter with OAuth. https://dev.twitter.com/oauth/overview/application-owner-access-tokens

# In[32]:

# Credentials live in a local, uncommitted `myKeys` module.
import myKeys
api_key = myKeys.api_key
api_secret = myKeys.api_secret
access_token_key = myKeys.access_token_key
access_token_secret = myKeys.access_token_secret

# Read the csv into a python dictionary

# In[33]:

cols = ['anger', 'anticipation', 'disgust', 'fear', 'joy',
        'negative', 'positive', 'sadness', 'surprise', 'trust']
dictFile = 'dict.csv'
# newline='' lets the csv module do its own line-ending handling, as its
# documentation requires for files handed to a reader.
with open(dictFile, newline='') as csvFile:
    # Maps each lexicon word to its emotion values, ordered like `cols`.
    mainDict = {
        row['Word']: [int(row[col]) for col in cols]
        for row in DictReader(csvFile)
    }

# #### Create a tweepy listener
# The StreamListener class can be used to handle the incoming tweets. Here every tweet will be given an emotional score in the shape of a 10-valued vector.
# In[34]:


class ColorListener(StreamListener):
    """Stream listener that gives every incoming tweet an emotion score.

    Each tweet's text is split into words, every word found in the lexicon
    contributes its 10-valued emotion vector, and the summed vector is stored
    together with the tweet text. The collected rows are available as a
    DataFrame through the ``tweets`` attribute.
    """

    # Emotion columns, in the order in which lexicon vectors are stored.
    EMOTIONS = ('anger', 'anticipation', 'disgust', 'fear', 'joy',
                'negative', 'positive', 'sadness', 'surprise', 'trust')
    COLUMNS = ('tweet',) + EMOTIONS
    # Punctuation that is dropped, and whitespace that is turned into a
    # plain space, before the text is split into words.
    _CLEANUP = str.maketrans({'.': None, ',': None, ';': None, ':': None,
                              '\t': ' ', '\n': ' '})

    def __init__(self, api=None, lexicon=None):
        """Create an empty listener.

        Args:
            api: forwarded to ``StreamListener.__init__``.
            lexicon: optional mapping of word -> sequence of emotion values
                (ordered like ``EMOTIONS``). When omitted, the module-level
                ``mainDict`` is looked up each time a tweet is scored.
        """
        super().__init__(api)
        self._lexicon = lexicon
        # Rows are accumulated in a plain list: growing a DataFrame one row
        # at a time copies the whole frame on every tweet.
        self._rows = []

    @property
    def tweets(self):
        """DataFrame with one row per scored tweet (text + emotion scores)."""
        return pd.DataFrame(self._rows, columns=list(self.COLUMNS))

    def on_data(self, data):
        """Score one raw stream message and record it.

        Messages that are not valid JSON or carry no 'text' field are
        skipped. Always returns True so the stream keeps running.
        """
        try:
            tweet = json.loads(data)
            vector = self.score(tweet)
        except (ValueError, KeyError, TypeError):
            # Malformed JSON, or a message without tweet text.
            return True
        print(vector)
        self._rows.append([tweet['text']] + vector)
        return True

    def score(self, tweet):
        """Return the summed emotion vector for ``tweet['text']``.

        A word is looked up as written first and, if absent, in lowercase,
        so capitalised words can still match lowercase lexicon entries
        (assumes the lexicon is keyed in lowercase; confirm against dict.csv).

        Raises:
            KeyError: if ``tweet`` has no 'text' entry.
        """
        lexicon = mainDict if self._lexicon is None else self._lexicon
        totals = [0] * len(self.EMOTIONS)
        for word in tweet['text'].translate(self._CLEANUP).split():
            entry = lexicon.get(word)
            if entry is None:
                entry = lexicon.get(word.lower())
            if entry is None:
                continue
            totals = [total + value for total, value in zip(totals, entry)]
        return totals

    def on_error(self, status):
        """Report a stream error; disconnect when rate limited (HTTP 420).

        Returning False tells tweepy to stop the stream instead of
        reconnecting, which is what its documentation advises for 420.
        """
        print("Error: ", status)
        return status != 420


# Let's run it:
# We can pick other listening filters.
# In[35]:

cListener = ColorListener()
auth = OAuthHandler(api_key, api_secret)
auth.set_access_token(access_token_key, access_token_secret)
stream = Stream(auth, cListener)
# Start reading stream for english tweets with the color words
stream.filter(languages=['en'], track=['red', 'green', 'blue'])
# stream.filter(languages=['en'], track=['trump'])

# In[36]:

df = cListener.tweets
print(len(df.index))  # Number of rows

# In[37]:

df.head(10)  # How the data looks like

# In[38]:

df.plot(figsize=(16, 6))  # Plot the sentiment as a time series

# In[39]:

df['trust'].plot(figsize=(16, 6))

# In[40]:

df.plot(subplots=True, figsize=(16, 10))

# In[41]:

df['trust'].plot.kde(figsize=(16, 6))

# In[43]:

df.sum()

# In[48]:

fig = plt.figure(figsize=(16, 10))
y_pos = np.arange(len(emotions))
percentage = (df[emotions].sum() / len(df) * 100).tolist()
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
# One colour per emotion, in the same order as `emotions`.
colors = ["#007C37", "#79BF2A", "#EBC527", "#E66F11", "#DB1245",
          "#0C2C30", "#F53022", "#7D4CA1", "#296CAB", "#1781AA"]
bars = ax.bar(y_pos, percentage, align='center')
for bar, c in zip(bars, colors):
    bar.set_facecolor(c)
plt.xticks(y_pos, emotions)
plt.ylabel('Percentage')

# In[44]:

# Correlate only the emotion scores: the 'tweet' column holds text and
# cannot take part in a correlation matrix.
cor = df[emotions].corr()
cor

# In[45]:

sns.heatmap(cor)

# In[46]:

sns.clustermap(cor)

# In[47]:

from matplotlib.pyplot import figure, show, rc
fig = figure(figsize=(10, 10))
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8], polar=True)
colors = ['#007C37', '#79BF2A', '#EBC527', '#E66F11',
          '#DB1245', '#7D4CA1', '#296CAB', '#1781AA']
N = 8
# N equally spaced angles, shifted back by pi/(2N).
theta = np.arange(0, 2*np.pi, 2*np.pi/N) - (np.pi/(2*N))
radii = df[['fear', 'trust', 'joy', 'anticipation',
            'anger', 'disgust', 'sadness', 'surprise']].sum()
width = np.pi/N
bars = ax.bar(theta, radii, width=width, bottom=0.0)
for bar, c in zip(bars, colors):
    bar.set_facecolor(c)
    bar.set_alpha(1)
show()

# ### Where else has this been used?
#
# - Client interaction.
# - Sexual differences in communication: http://dl.acm.org/citation.cfm?id=2107662
# - Children's bedtime stories: http://www.musicfromtext.com/
# - Mercutio: http://mercutio.albertobarradas.com/

# ### Where can it be used?
# Generalization: Dictionary based lexical analysis.
# - Colors: http://www.lexichrome.com/#palette
#
# Correlation with other variables.
# - Correlating with digital profiles: Personality
# - Correlating with digital activity: Motivation
# - Correlating with physical data: Behaviour