#!/usr/bin/env python
# coding: utf-8

# # Emotional Text Analysis
# ## Statistical text analysis for emotional extraction and some generalizations
# #### Alberto Barradas
# [@abcsds](http://github.com/abcsds)

# ## Emotion, and behaviour.
# 
# There are several theories of emotion; all of them indicate that emotions are a precursor of behaviour.

# ## Models of Emotion

# ### Affect
# Most used model for text analysis. Maps into 2D plane. Cannot map back into emotions.
# ![](img/Circumplex.jpg)

# ### Bio-chemical
# Very precise, closest to biochemical definition of emotions. Not practical.
# ![](img/Loevheim.png)

# ### Ekman's model of emotion
# Widely used in face recognition programs. Widely accepted. Requires face-recognition software, thus very costly.
# ![](img/Ekman.jpg)

# ### Functional
# Based on ethological observations. Superset of Ekman's model.
# ![](img/Plutchik.png)
# http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm

# ### The dictionary

# In[19]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
# Enable inline plots when running under IPython/Jupyter. ``get_ipython`` is
# only injected into the namespace by IPython, so when this file is executed
# as a plain Python script the magic is skipped instead of raising NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass


# In[2]:


# Names of the ten lexicon categories; they double as the score column names
# used for every table and chart below.
emotions = [
    "anger", "anticipation", "disgust", "fear", "joy",
    "negative", "positive", "sadness", "surprise", "trust",
]

# Load the lexicon (one row per word) and preview its first ten rows.
df = pd.read_csv("dict.csv")
df.head(n=10)


# In[3]:


# Column totals of the lexicon, one value per category.
df.loc[:, emotions].sum()


# In[15]:


# Lexicon size, followed by each category total expressed as a percentage of
# the number of lexicon rows.
print(df.shape[0])
df[emotions].sum().div(len(df)).mul(100)


# In[30]:


# Bar chart of the lexicon: for every category, the column total divided by
# the number of rows, times 100.
fig = plt.figure(figsize=(16, 10))
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

y_pos = np.arange(len(emotions))
percentage = (df[emotions].sum() / len(df) * 100).tolist()

# One fixed colour per category, in the same order as ``emotions``.
colors = [
    "#007C37", "#79BF2A", "#EBC527", "#E66F11", "#DB1245",
    "#0C2C30", "#F53022", "#7D4CA1", "#296CAB", "#1781AA",
]
bars = ax.bar(y_pos, percentage, align='center')
for bar, c in zip(bars, colors):
    bar.set_facecolor(c)

# Label the axes through the Axes object that was just drawn on.
ax.set_xticks(y_pos)
ax.set_xticklabels(emotions)
ax.set_ylabel('Percentage')


# ### Applying it
# Using tweepy for Twitter access, but the same analysis can be done with any text. http://www.tweepy.org/
# 
# Tools used:
# - Tweepy listener for extracting twitter data.
# - Tweepy Stream and OAuthHandler classes for connectivity
# - Python's json library for extracting text from tweets
# - A csv to python-dictionary reader from the csv library
# - Seaborn for visualizations https://seaborn.pydata.org/

# In[31]:


from tweepy.streaming import StreamListener
from tweepy import Stream
from tweepy import OAuthHandler

import json
from csv import DictReader
import seaborn as sns


# #### Hidden keys :)
# Access twitter with OAuth. https://dev.twitter.com/oauth/overview/application-owner-access-tokens

# In[32]:


import myKeys

# Copy the four OAuth credentials out of the local ``myKeys`` module so the
# rest of the script can refer to them by plain names.
api_key, api_secret = myKeys.api_key, myKeys.api_secret
access_token_key, access_token_secret = (
    myKeys.access_token_key,
    myKeys.access_token_secret,
)


# Read the csv into a python dictionary

# In[33]:


# Order of the values inside every score vector stored in ``mainDict``.
cols = [
    'anger', 'anticipation', 'disgust', 'fear', 'joy',
    'negative', 'positive', 'sadness', 'surprise', 'trust',
]

# Build the lookup table: lexicon word -> list of integer scores, one per
# entry of ``cols``. The word is read from the csv column named 'Word'.
dictFile = 'dict.csv'
mainDict = {}
with open(dictFile) as csvFile:
    reader = DictReader(csvFile)
    for row in reader:
        mainDict[row['Word']] = list(map(int, (row[name] for name in cols)))


# #### Create a tweepy listener
# The StreamListener class can be used to handle the incoming tweets. Here every tweet will be given an emotional score in the shape of a 10-valued vector.

# In[34]:


class ColorListener(StreamListener):
    """Stream listener that gives every incoming tweet an emotional score.

    Each tweet is scored against the module-level ``mainDict`` lexicon and
    stored, together with its text, as one row. The collected rows are
    exposed as a pandas DataFrame through the ``tweets`` attribute.
    """

    # Score columns, in the same order as the vectors stored in ``mainDict``.
    EMOTIONS = ('anger', 'anticipation', 'disgust', 'fear', 'joy',
                'negative', 'positive', 'sadness', 'surprise', 'trust')
    # Full row layout: the tweet text followed by its ten scores.
    COLUMNS = ('tweet',) + EMOTIONS

    def __init__(self):
        # Let the base listener initialise whatever state it needs.
        super(ColorListener, self).__init__()
        # Rows are accumulated in a plain list; appending to a DataFrame one
        # row at a time copies the whole frame on every tweet, and
        # ``DataFrame.append`` no longer exists in recent pandas releases.
        self._rows = []

    @property
    def tweets(self):
        """Return the tweets collected so far as a DataFrame.

        The frame has the columns listed in ``COLUMNS`` and one row per
        scored tweet, in arrival order. A new frame is built on every access.
        """
        return pd.DataFrame(self._rows, columns=list(self.COLUMNS))

    def on_data(self, data):
        """Score one raw stream message and store it.

        Args:
            data: Raw JSON string received from the stream.

        Returns:
            Always True, whether or not the message could be scored.
        """
        try:
            tweet = json.loads(data)
            text = tweet['text']
            vector = self.score(tweet)
        except (KeyError, TypeError, ValueError, AttributeError):
            # Malformed JSON or a message without usable text: skip it.
            # Only these expected failures are ignored, so KeyboardInterrupt
            # and genuine bugs are no longer silently swallowed.
            return True
        print(vector)
        self._rows.append([text] + vector)
        return True

    def score(self, tweet):
        """Return the 10-valued score vector for ``tweet['text']``.

        The characters ``. , ; :`` are removed, tabs and newlines become
        spaces, and the text is split on single spaces. The vectors of all
        words found in ``mainDict`` are added up element-wise.

        Args:
            tweet: Mapping with a 'text' entry holding the tweet text.

        Returns:
            List of ten integers ordered as ``EMOTIONS``.
        """
        line = tweet['text']
        for mark in '.,;:':
            line = line.replace(mark, '')
        for blank in '\t\n':
            line = line.replace(blank, ' ')

        totals = [0] * len(self.EMOTIONS)
        for word in line.split(' '):
            vector = mainDict.get(word)
            if vector is not None:
                totals = [t + v for t, v in zip(totals, vector)]
        return totals

    def on_error(self, status):
        """Print the error status reported by the stream.

        Returns None. NOTE(review): how tweepy reacts to this return value
        (keep retrying vs. disconnect) is not visible here - confirm against
        the tweepy streaming documentation, especially for rate-limit errors.
        """
        print("Error: ", status)


# Let's run it:
# We can pick other listening filters.

# In[35]:


cListener = ColorListener()
auth = OAuthHandler(api_key, api_secret)
auth.set_access_token(access_token_key, access_token_secret)

stream = Stream(auth, cListener)

# Start reading stream for english tweets with the color words.
# The call is expected to run until it is stopped by hand, so an interrupt
# (Ctrl-C / "interrupt kernel") is treated as the normal way to end the
# collection: the stream is disconnected and the script carries on with the
# analysis of whatever was gathered instead of aborting.
try:
    stream.filter(languages=['en'], track=['red', 'green', 'blue'])
    # Other listening filter, for example:
    # stream.filter(languages=['en'], track=['trump'])
except KeyboardInterrupt:
    pass
finally:
    stream.disconnect()


# In[36]:


# From here on ``df`` is the table of collected tweets (text + ten scores),
# no longer the lexicon.
df = cListener.tweets
print(len(df.index))  # Number of rows


# In[37]:


df.head(10)  # What the data looks like


# In[38]:


df.plot(figsize=(16, 6))  # Plot the sentiment as a time series


# In[39]:


# Same series, restricted to a single emotion.
df['trust'].plot(figsize=(16, 6))


# In[40]:


# One panel per score column.
df.plot(subplots=True, figsize=(16, 10))


# In[41]:


# Kernel density estimate of the per-tweet 'trust' score.
df['trust'].plot.kde(figsize=(16, 6))


# In[43]:


# Total score per category. Only the score columns are summed: a bare
# ``df.sum()`` would also "sum" the free-text 'tweet' column, i.e.
# concatenate every collected tweet into one giant string.
df[emotions].sum()


# In[48]:


# Bar chart of the collected tweets: for every category, the summed score
# divided by the number of tweets, times 100.
fig = plt.figure(figsize=(16, 10))
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

y_pos = np.arange(len(emotions))
percentage = (df[emotions].sum() / len(df) * 100).tolist()

# One fixed colour per category, in the same order as ``emotions``.
colors = [
    "#007C37", "#79BF2A", "#EBC527", "#E66F11", "#DB1245",
    "#0C2C30", "#F53022", "#7D4CA1", "#296CAB", "#1781AA",
]
bars = ax.bar(y_pos, percentage, align='center')
for bar, c in zip(bars, colors):
    bar.set_facecolor(c)

# Label the axes through the Axes object that was just drawn on.
ax.set_xticks(y_pos)
ax.set_xticklabels(emotions)
ax.set_ylabel('Percentage')


# In[44]:


# Pairwise correlation between the ten score columns. The free-text 'tweet'
# column is excluded explicitly: recent pandas versions no longer drop
# non-numeric columns silently and raise when asked to correlate strings.
cor = df[emotions].corr()
cor


# In[45]:


# Correlation matrix as a heat map.
sns.heatmap(cor)


# In[46]:


# Same matrix with rows/columns reordered by hierarchical clustering.
sns.clustermap(cor)


# In[47]:


from matplotlib.pyplot import figure, show, rc

# Polar bar chart of the total score of eight emotions across all collected
# tweets; the 'positive' and 'negative' columns are left out.
N = 8
colors = [
    '#007C37', '#79BF2A', '#EBC527', '#E66F11',
    '#DB1245', '#7D4CA1', '#296CAB', '#1781AA',
]

# Bar heights, in the order the sectors are laid out around the circle.
radii = df[['fear', 'trust', 'joy', 'anticipation',
            'anger', 'disgust', 'sadness', 'surprise']].sum()
# N evenly spaced angles, each shifted back by pi / (2 * N).
theta = np.arange(0, 2 * np.pi, 2 * np.pi / N) - (np.pi / (2 * N))
width = np.pi / N

fig = figure(figsize=(10, 10))
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8], polar=True)
bars = ax.bar(theta, radii, width=width, bottom=0.0)
for bar, c in zip(bars, colors):
    bar.set_facecolor(c)
    bar.set_alpha(1)

show()


# ### Where else has this been used?
# 
# - Client interaction.
# - Sexual differences in communication: http://dl.acm.org/citation.cfm?id=2107662
# - Children's bedtime stories: http://www.musicfromtext.com/
# - Mercutio: http://mercutio.albertobarradas.com/

# ### Where can it be used?
# Generalization: Dictionary based lexical analysis. 
# - Colors: http://www.lexichrome.com/#palette
# 
# Correlation with other variables.
# - Correlating with digital profiles: Personality
# - Correlating with digital activity: Motivation
# - Correlating with physical data: Behaviour