#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Data libraries.
import pandas as pd
import numpy as np
# Show every float in DataFrame/Series output with two decimal places.
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Plotting libraries and figure defaults.
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
# Notebook-only magic: get_ipython() exists only inside an IPython session,
# so this line fails under a plain `python` interpreter.
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['savefig.dpi'] = 150  # default DPI for figures written with savefig
plt.style.use('ggplot')

import seaborn as sns
# NOTE(review): these seaborn calls run after plt.style.use('ggplot') and
# presumably override part of that style -- confirm the ggplot line is still wanted.
sns.set(color_codes=True)
sns.set_palette(sns.color_palette("husl", 8));
sns.mpl.rc("figure", figsize=(8,5))  # default figure size, in inches
sns.set_style('whitegrid')


# In[2]:


# Move the working directory one level up (presumably to the project root, so
# that the relative 'data/' and 'figs/' paths used below resolve -- confirm).
# The bare `cd ../` automagic is not valid Python outside an IPython prompt;
# the explicit magic call is. NOTE: re-running this cell moves up again.
get_ipython().run_line_magic('cd', '../')


# In[3]:


# Load the two news tables: `pn` is published news, `tn` is tweeted news.
pn = pd.read_csv('data/pb-sp.csv', encoding='utf-8')
tn = pd.read_csv('data/tw-sp.csv', encoding='utf-8')

# Alphabetically ordered list of the outlets found in the published-news table.
outlets = pn['outlet'].unique().tolist()
outlets.sort()

# Sentiment polarity `sp` = positive-emotion score minus negative-emotion score.
for frame in (pn, tn):
    frame['sp'] = frame['posemo'].sub(frame['negemo'])
# Same difference on the `_t` columns (polarity of the tweet text).
tn['sp_t'] = tn['posemo_t'].sub(tn['negemo_t'])


# In[5]:


# Sports news only: correlation between retweet count and the two emotion scores.
sports_rows = tn['cat'] == 'sports'
tn.loc[sports_rows, ['rt', 'posemo', 'negemo']].corr()


# In[13]:


# Same correlation table, restricted to the 'politics' and 'middleeast' categories.
political_rows = tn['cat'].isin(['politics', 'middleeast'])
tn.loc[political_rows, ['rt', 'posemo', 'negemo']].corr()


# In[4]:


# News tweeted multiple times is split into two groups per article (href):
# the least retweeted tweet versus every other tweet of the same article.
# The printed pair is (mean sp_t of the others, mean sp_t of the least retweeted).
df4 = tn.groupby('href').filter(lambda g: len(g) > 3)  # news tweeted at least four times

# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement.
# Only the two columns that are kept are handed to apply().
by_href = df4.groupby('href')[['rt', 'sp_t']]
more_rt = by_href.apply(lambda g: g.sort_values('rt').iloc[1:])  # all tweets except the least retweeted
less_rt = by_href.apply(lambda g: g.sort_values('rt').iloc[0])   # the least retweeted tweet
print(more_rt.sp_t.mean(), less_rt.sp_t.mean())


# In[5]:


# For news tweeted exactly three times, rank each article's tweets by retweet
# count (0 = least retweeted) and plot the mean tweet sentiment per rank.
df = tn.groupby('href').filter(lambda g: len(g) == 3)  # news tweeted exactly three times

# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement.
# reset_index(drop=True) renumbers the sorted rows 0..2 so the inner index
# level is the retweet rank within the article.
grouped = (
    df.groupby('href')[['rt', 'sp_t', 'cat']]
    .apply(lambda g: g.sort_values('rt').reset_index(drop=True))
)
# One row per href; columns become (field, retweet rank).
unstacked = grouped.unstack()
ax = unstacked.sp_t.mean()[:4].plot(kind='bar')
ax.set(xlabel='Retweet order (0 is the least retweeted group)',ylabel='Mean Tweet Sentiment',title='Tweet Sentiment vs RT Groups');


# In[6]:


# Display the wide table built by the previous cell (one row per href).
# The disabled line below would instead report the sample size, i.e. the
# number of non-null sp_t values, for each retweet-rank column:
#pd.DataFrame([unstacked.sp_t[[i]].dropna().shape[0] for i in range(unstacked.sp_t.columns.shape[0])],columns=['N'])
unstacked


# In[4]:


# Strip plots of retweet counts, one panel per outlet, restricted to news
# that was tweeted more than three times.
df4 = tn.groupby('href').filter(lambda grp: len(grp) > 3)

# 3x3 grid with eight panels: the last row holds one double-width panel and one single.
gs = gridspec.GridSpec(3, 3)
cells = (gs[0, 0], gs[0, 1], gs[0, 2],
         gs[1, 0], gs[1, 1], gs[1, 2],
         gs[2, :2], gs[2, 2])
axs = [plt.subplot(cell) for cell in cells]

# Upper y limit applied to selected outlets before their points are drawn.
y_caps = {'CNN': 3000, 'ABC': 3000, 'NYT': 2000, 'WPOST': 1000}

for i, o in enumerate(df4.outlet.unique()):
    panel = axs[i]
    panel.set_title(o)
    if o in y_caps:
        panel.set_ylim([0, y_caps[o]])
    g = sns.stripplot(x="href", y="rt", data=df4[df4.outlet == o], ax=panel, jitter=True, size=4)
    # Label the y axis on every third panel and the x axis on the last two only.
    if i % 3 == 0:
        ylabel = 'Retweet'
    else:
        ylabel = ''
    if i >= 6:
        xlabel = 'Tweeted News'
    else:
        xlabel = ''
    panel.set(xlabel=xlabel, ylabel=ylabel, ylim=0, xticks=[])

f = plt.gcf()
f.set_size_inches(10, 7, forward=True)
f.suptitle('Retweet Counts of The News Tweeted at Least Four Times', fontsize=18, fontweight='bold')
f.savefig('figs/rt-news-stripplot.png', bbox_inches='tight')


# In[5]:


# Retweet summary statistics for three sets of tweets: those with no URL,
# those with any URL, and those in the tweeted-news table `tn`.
tw = pd.read_csv('data/LIWC/LIWC2015 Results (tweet-texts).csv', encoding='utf-8')
has_url = tw.url.notnull()
nan = tw[~has_url]  # tweets whose url field is missing
url = tw[has_url]   # tweets that carry a url
rt_summary = {
    'no url': nan.rt.describe(),
    'any url': url.rt.describe(),
    'news url': tn.rt.describe(),
}
pd.DataFrame(rt_summary)


# In[14]:


# Retweeted news: every row of `tn` appears once, plus once more for each
# full 50 retweets it received, so heavily retweeted news weighs more.
# A single vectorised row selection replaces the original row-by-row
# iterrows()/concat build, which its own note timed at roughly 3.5 minutes.
# Assumes `rt` is a non-negative count with no missing values -- a NaN makes
# astype(int) raise, just as int() did in the row-by-row version.
repeat_counts = (1 + tn['rt'] // 50).astype(int)
rn = tn.iloc[np.repeat(np.arange(len(tn)), repeat_counts.values)]


# In[6]:


# Summary statistics of sentiment polarity for the three news sets, side by side.
sp_summary = {}
sp_summary['Published'] = pn.sp.describe()
sp_summary['Tweeted'] = tn.sp.describe()
sp_summary['Retweeted'] = rn.sp.describe()
pd.DataFrame(sp_summary)


# In[7]:


# Mean sentiment polarity against mean retweet count, one point per category.
# Raising `filtr` requires more news per category and so shows fewer categories.
filtr = 100
excluded = ('news', 'article', 'storyline', 'bigstory')  # category labels left out of the plot
df = tn.groupby('cat').filter(lambda x: len(x) > filtr and x.name not in excluded)
df = pd.pivot_table(df, values=['sp', 'rt'], index=['cat'])  # default aggregation: mean

# regplot takes x/y/data as keywords; current seaborn rejects the positional form.
ax = sns.regplot(x='rt', y='sp', data=df)
ax.set(xlabel='Retweet', ylabel='Sentiment Polarity', title='SP vs RT of Categories', ylim=(-1, 2))

# Label every point with its category. Coordinates are looked up by column
# name so the label position does not depend on the pivot's column order.
for cat_name, row in df.iterrows():
    ax.annotate(cat_name, (row['rt'], row['sp']), xytext=(5, -2), textcoords='offset points')
plt.gcf().savefig('figs/cat-rt-sp-scatter.png')


# In[8]:


# Mean, median and maximum retweet count per outlet.
# Aggregations are named by string: passing np.mean / np.median / builtin max
# as callables is deprecated in recent pandas, and the string names give the
# same column labels ('mean', 'median', 'max').
pd.pivot_table(
    tn,
    index=['outlet'],
    values=['rt'],
    aggfunc=['mean', 'median', 'max'],
).rename(columns={'rt': 'Retweet'})


# In[9]:


# Kernel density estimate of retweet counts, one curve per outlet, all on the
# current axes.
ax = plt.gca()
for o in outlets:
    sns.kdeplot(tn[tn.outlet == o].rt, label=o, ax=ax)
ax.set(xlim=(0,500),title='Retweet Distributions',xlabel='Retweet Count',ylabel='Density');


# In[10]:


# Density of sentiment polarity (posemo - negemo) for published, tweeted and
# retweeted news, drawn on one set of axes.
c = 'sp'
ax = plt.gca()
for frame, label in ((pn, 'Published News'), (tn, 'Tweeted News'), (rn, 'Retweeted News')):
    sns.kdeplot(frame[c], label=label, ax=ax)
ax.set(
    title='Comparing Sentiment Polarity of News Published/Tweeted/Retweeted - All Outlets Combined',
    xlim=(-10, 10),
    xlabel='Sentiment Polarity',
    ylabel='Density',
)
plt.savefig('figs/sentiment-comparison-published-tweeted-retweeted.png', bbox_inches='tight')


# In[11]:


# Density of the `Tone` column for published, tweeted and retweeted news.
tone_sources = [('Published News', pn), ('Tweeted News', tn), ('Retweeted News', rn)]
for label, frame in tone_sources:
    # Each call draws on the current axes and returns them.
    ax = sns.kdeplot(frame['Tone'], label=label)
ax.set(
    title='Comparing Tone of News Published/Tweeted/Retweeted - All Outlets Combined',
    xlabel='Tone',
    ylabel='Density',
)
plt.savefig('figs/tone-comparison-published-tweeted-retweeted.png', bbox_inches='tight')


# In[12]:


# Sentiment polarity density of published news, one curve per outlet.
df = pn
ax = plt.gca()
for o in outlets:
    sns.kdeplot(df.loc[df['outlet'] == o, 'sp'], label=o, ax=ax)
ax.set(
    xlim=(-10, 10),
    title='Sentiment Polarities of Published News',
    xlabel='Sentiment Polarity',
    ylabel='Density',
)
plt.savefig('figs/sentiment-published.png', bbox_inches='tight')


# In[13]:


# Sentiment polarity density of tweeted news, one curve per outlet.
df = tn
ax = plt.gca()
for o in outlets:
    outlet_sp = df.loc[df['outlet'] == o, 'sp']
    sns.kdeplot(outlet_sp, label=o, ax=ax)
ax.set(
    xlim=(-10, 10),
    title='Sentiment Polarities of Tweeted News',
    xlabel='Sentiment Polarity',
    ylabel='Density',
)
plt.savefig('figs/sentiment-tweeted.png', bbox_inches='tight')


# In[14]:


# Sentiment polarity density of retweeted news, one curve per outlet.
df = rn
ax = plt.gca()
for o in outlets:
    is_outlet = df['outlet'] == o
    sns.kdeplot(df[is_outlet]['sp'], label=o, ax=ax)
ax.set(
    xlim=(-10, 10),
    title='Sentiment Polarities of Retweeted News',
    xlabel='Sentiment Polarity',
    ylabel='Density',
)
plt.savefig('figs/sentiment-retweeted.png', bbox_inches='tight')


# In[15]:


# One panel per outlet, each comparing the polarity density of all published
# news, tweeted news and retweeted news for that outlet.
f, axes = plt.subplots(nrows=4, ncols=2, figsize=(16, 16), subplot_kw={'xlim': (-10, 10)})
plt.subplots_adjust(hspace=0.3, wspace=0.1)
for i, o in enumerate(outlets):
    # Row-major placement in the 4x2 grid. Floor division is required here:
    # `i/2` is a float on Python 3 and cannot be used as an array index.
    ax = axes[i // 2][i % 2]
    ax.set_title(o, fontsize=14)
    ax.set_xlabel('Sentiment Polarity')
    ax.set_ylabel('Density')
    sns.kdeplot(pn[pn['outlet'] == o]['sp'], label='all news', ax=ax)
    sns.kdeplot(tn[tn['outlet'] == o]['sp'], label='tweeted', ax=ax)
    sns.kdeplot(rn[rn['outlet'] == o]['sp'], label='retweeted', ax=ax)
f.suptitle('Sentiment Scores per Outlet', fontsize=20, y=0.93)
plt.savefig('figs/sentiment-per-outlet.png', bbox_inches='tight')


# In[16]:


# Mean sentiment polarity per outlet for published, tweeted and retweeted
# news, joined into one table indexed by outlet.
# aggfunc is given as the string 'mean': passing the np.mean callable is
# deprecated in recent pandas and the string selects the same aggregation.
df = pd.pivot_table(pn, index=['outlet'], values=['sp'], aggfunc='mean').rename(columns={'sp': 'Published'})
for frame, column in ((tn, 'Tweeted'), (rn, 'Retweeted')):
    outlet_mean = pd.pivot_table(frame, index=['outlet'], values=['sp'], aggfunc='mean')
    df = df.join(outlet_mean.rename(columns={'sp': column}))
df


# In[17]:


# Line plot of whatever `df` currently holds; this expects `df` to be the
# per-outlet averages table built in the cell just before this one.
ax = df.plot()
ax.set(
    title='Sentiment Polarities of News Averaged per Outlet',
    ylabel='Sentiment Polarity',
    xlabel='Outlets',
)
plt.savefig('figs/sentiment-averages-per-outlet.png', bbox_inches='tight')


# In[32]:


# Polarity densities for politics and sports: tweeted news as solid curves,
# retweeted news as dashed curves, all on one set of axes.
cs = ['politics', 'sports']  # 'world' was tried here as a third category
ax = plt.gca()
# Plain loops replace list comprehensions that were run only for their side
# effects; the loop variable is not `c`, so the notebook-level `c` is untouched.
for cat_name in cs:
    sns.kdeplot(tn[tn['cat'] == cat_name]['sp'], label=cat_name + ' tweeted', ax=ax)
# The dashed curve takes the palette colour at the category's position,
# intended to match the colour of that category's solid curve.
for i, cat_name in enumerate(cs):
    sns.kdeplot(
        rn[rn['cat'] == cat_name]['sp'],
        label=cat_name + ' retweeted',
        linestyle='--',
        color=sns.color_palette()[i],
        ax=ax,
    )
# Title spelling corrected ("Distrubition" -> "Distribution").
ax.set(xlim=(-7, 7), title='Sentiment Distribution of Politics and Sports News', xlabel='Sentiment Polarity', ylabel='Density')
fname = '-'.join(cs)
plt.savefig('figs/tweeted-' + fname + '.png', bbox_inches='tight')


# In[19]:


# Drop AP and CBSNews from the outlet list used by the cells that follow.
# The list is rebuilt rather than edited with list.remove(), so this cell
# can be re-run safely: remove() raises ValueError once the name is gone.
outlets = [o for o in outlets if o not in ('AP', 'CBSNews')]


# In[20]:


# One panel per remaining outlet: polarity density of tweeted versus retweeted
# news in the 'politics' category.
splim = (-10, 10)
sns.mpl.rc("figure", figsize=(10, 4))
c = 'politics'
f, axes = plt.subplots(nrows=3, ncols=2, figsize=(16, 12), subplot_kw={'xlim': splim})
plt.subplots_adjust(hspace=0.3, wspace=0.1)
for i, o in enumerate(outlets):
    # Row-major placement in the 3x2 grid. Floor division is required here:
    # `i/2` is a float on Python 3 and cannot be used as an array index.
    ax = axes[i // 2][i % 2]
    ax.set_title(o, fontsize=14)
    ax.set_xlabel('Sentiment Polarity')
    ax.set_ylabel('Density')
    # NOTE: `tw` rebinds the name an earlier cell uses for the LIWC tweet table.
    tw = tn[(tn['cat'] == c) & (tn['outlet'] == o)]['sp']
    rt = rn[(rn['cat'] == c) & (rn['outlet'] == o)]['sp']
    sns.kdeplot(tw, label=c + ' tweeted', ax=ax)
    sns.kdeplot(rt, label=c + ' retweeted', ax=ax)
f.suptitle('Sentiment Scores for "Politics" per Outlet', fontsize=20, y=0.94)
plt.savefig('figs/politics-per-outlet.png', bbox_inches='tight')


# In[21]:


# One panel per remaining outlet: polarity density of tweeted versus retweeted
# news in the 'sports' category. The CNN panel only shows a "Category N/A" note.
splim = (-10, 10)
sns.mpl.rc("figure", figsize=(10, 4))
c = 'sports'
f, axes = plt.subplots(nrows=3, ncols=2, figsize=(16, 12), subplot_kw={'xlim': splim})
plt.subplots_adjust(hspace=0.3, wspace=0.1)
for i, o in enumerate(outlets):
    # Row-major placement in the 3x2 grid. Floor division is required here:
    # `i/2` is a float on Python 3 and cannot be used as an array index.
    ax = axes[i // 2][i % 2]
    ax.set_title(o, fontsize=14)
    if o == 'CNN':
        # Centre the note in axes coordinates and skip plotting for this outlet.
        ax.text(0.5, 0.5, 'Category N/A',
                horizontalalignment='center',
                verticalalignment='center',
                fontsize=16, color='red',
                transform=ax.transAxes)
        continue
    ax.set_xlabel('Sentiment Polarity')
    ax.set_ylabel('Density')
    # NOTE: `tw` rebinds the name an earlier cell uses for the LIWC tweet table.
    tw = tn[(tn['cat'] == c) & (tn['outlet'] == o)]['sp']
    rt = rn[(rn['cat'] == c) & (rn['outlet'] == o)]['sp']
    sns.kdeplot(tw, label=c + ' tweeted', ax=ax)
    sns.kdeplot(rt, label=c + ' retweeted', ax=ax)
f.suptitle('Sentiment Scores for "Sports" per Outlet', fontsize=20, y=0.94)
plt.savefig('figs/sports-per-outlet.png', bbox_inches='tight')


# In[22]:


# Mean sentiment polarity per outlet for politics news: tweeted vs retweeted.
# aggfunc is given as the string 'mean': passing the np.mean callable is
# deprecated in recent pandas and the string selects the same aggregation.
c = 'politics'
tweeted_mean = pd.pivot_table(tn[tn['cat'] == c], index=['outlet'], values=['sp'], aggfunc='mean')
retweeted_mean = pd.pivot_table(rn[rn['cat'] == c], index=['outlet'], values=['sp'], aggfunc='mean')
df = tweeted_mean.rename(columns={'sp': 'Tweeted'})
df = df.join(retweeted_mean.rename(columns={'sp': 'Retweeted'}))
df


# In[23]:


# Mean sentiment polarity per outlet for sports news: tweeted vs retweeted.
# aggfunc is given as the string 'mean': passing the np.mean callable is
# deprecated in recent pandas and the string selects the same aggregation.
c = 'sports'
tweeted_mean = pd.pivot_table(tn[tn['cat'] == c], index=['outlet'], values=['sp'], aggfunc='mean')
retweeted_mean = pd.pivot_table(rn[rn['cat'] == c], index=['outlet'], values=['sp'], aggfunc='mean')
df = tweeted_mean.rename(columns={'sp': 'Tweeted'})
df = df.join(retweeted_mean.rename(columns={'sp': 'Retweeted'}))
df


# In[24]:


# Is the polarity of the tweet text (sp_t), or of the news itself (sp),
# correlated with the retweet count?
polarity_cols = ['rt', 'sp', 'sp_t']
tn[polarity_cols].corr()