In [1]:
# Core data libraries
import pandas as pd
import numpy as np
# Show floats in displayed frames with two decimal places
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Plotting stack; figures are rendered inline in the notebook
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline
plt.rcParams['savefig.dpi'] = 150  # resolution used when figures are saved
plt.style.use('ggplot')

# Seaborn theme: 8-colour "husl" palette, 8x5 default figure size, white grid.
# NOTE(review): sns.set() runs after plt.style.use('ggplot') and probably
# overrides most of that style -- confirm which look is intended.
import seaborn as sns
sns.set(color_codes=True)
sns.set_palette(sns.color_palette("husl", 8));
sns.mpl.rc("figure", figsize=(8,5))
sns.set_style('whitegrid')
In [2]:
cd ../
C:\Users\Talha\Documents\WinPython3\projects\News-Sharing-by-Sentiment
In [3]:
# Load the two news tables -> pn: published news, tn: tweeted news
pn = pd.read_csv('data/pb-sp.csv', encoding='utf-8')
tn = pd.read_csv('data/tw-sp.csv', encoding='utf-8')

# Distinct outlet names found in the published news, in sorted order
outlets = sorted(pn.outlet.unique())

# Sentiment polarity = posemo - negemo.
# For tweeted news it is also derived from the '*_t' columns as sp_t.
pn['sp'] = pn.posemo.sub(pn.negemo)
tn['sp'] = tn.posemo.sub(tn.negemo)
tn['sp_t'] = tn.posemo_t.sub(tn.negemo_t)
In [5]:
# Sports news only: correlation matrix of the rt, posemo and negemo columns
tn.loc[tn['cat'] == 'sports', ['rt', 'posemo', 'negemo']].corr()
Out[5]:
rt posemo negemo
rt 1.00 0.14 -0.04
posemo 0.14 1.00 -0.09
negemo -0.04 -0.09 1.00
In [13]:
# Politics and Middle East news: correlation matrix of the rt, posemo and negemo columns
tn.loc[tn['cat'].isin(['politics', 'middleeast']), ['rt', 'posemo', 'negemo']].corr()
Out[13]:
rt posemo negemo
rt 1.00 -0.05 0.05
posemo -0.05 1.00 0.03
negemo 0.05 0.03 1.00
In [4]:
# News tweeted multiple times are split into two groups per article (href):
# the least retweeted tweet vs. all the other tweets of that article.
df4 = tn.groupby('href').filter(lambda g: len(g) > 3)  # news tweeted at least four times

# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
# Only the two columns that are used are handed to apply().
by_href = df4.groupby('href')[['rt', 'sp_t']]
more_rt = by_href.apply(lambda g: g.sort_values('rt').iloc[1:])  # all the tweets excluding the least RTed
less_rt = by_href.apply(lambda g: g.sort_values('rt').iloc[0])   # least retweeted tweet of each article

# Mean tweet sentiment of the two groups
print(more_rt.sp_t.mean(), less_rt.sp_t.mean())
-0.335782241015 -0.08625
In [5]:
# For news tweeted exactly three times, rank each article's tweets by retweet
# count (0 = least retweeted) and compare the mean tweet sentiment per rank.
df = tn.groupby('href').filter(lambda g: len(g) == 3)  # news tweeted exactly three times

# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
# reset_index(drop=True) renumbers the sorted tweets 0..2 inside each article,
# so that unstack() below can spread them into one column per rank.
grouped = df.groupby('href')[['rt', 'sp_t', 'cat']].apply(
    lambda g: g.sort_values('rt').reset_index(drop=True))
unstacked = grouped.unstack()

ax = unstacked.sp_t.mean()[:4].plot(kind='bar')
ax.set(xlabel='Retweet order (0 is the least retweeted group)',
       ylabel='Mean Tweet Sentiment',
       title='Tweet Sentiment vs RT Groups');
In [6]:
# Display the wide table built from the three-tweet articles: one row per
# article (href), with rt / sp_t / cat spread over columns 0-2.
# Disabled alternative kept for reference -- sample size for each group:
#pd.DataFrame([unstacked.sp_t[[i]].dropna().shape[0] for i in range(unstacked.sp_t.columns.shape[0])],columns=['N'])
unstacked
Out[6]:
rt sp_t cat
0 1 2 0 1 2 0 1 2
href
http://abcnews.go.com/Business/cup-inventor-john-sylvan-admits-expensive-coffee-pods/story?id=29382481 94.00 189.00 326.00 4.35 -3.84 -3.84 business business business
http://abcnews.go.com/Business/embattled-sony-pictures-executive-amy-pascal-steps/story?id=28749965 33.00 57.00 74.00 0.00 0.00 0.00 business business business
http://abcnews.go.com/Business/kraft-krft-heinz-agree-merge/story?id=29889951 123.00 166.00 208.00 4.55 8.00 7.69 business business business
http://abcnews.go.com/Business/make-104-acre-florida-island-dream-home/story?id=29732824 168.00 181.00 232.00 0.00 0.00 0.00 business business business
http://abcnews.go.com/Entertainment/bobbi-kristina-brown-alive-found-unresponsive/story?id=28628653 294.00 316.00 714.00 0.00 0.00 0.00 entertainment entertainment entertainment
http://abcnews.go.com/Entertainment/duchess-kate-appeals-greater-care-childrens-mental-health/story?id=28988190 210.00 351.00 369.00 0.00 0.00 3.85 entertainment entertainment entertainment
http://abcnews.go.com/Entertainment/harrison-ford-injured-california-small-plane-crash/story?id=29425681 186.00 478.00 746.00 0.00 0.00 0.00 entertainment entertainment entertainment
http://abcnews.go.com/Entertainment/jon-stewart-leaving-comedy-central/story?id=28875084 118.00 179.00 189.00 5.00 0.00 3.45 entertainment entertainment entertainment
http://abcnews.go.com/Entertainment/kevin-costner-god-miracle-bobbi-kristina-brown/story?id=28867469 133.00 155.00 217.00 7.69 7.69 7.69 entertainment entertainment entertainment
http://abcnews.go.com/Entertainment/meet-elena-avalor-disneys-latina-princess/story?id=28581447 292.00 810.00 1300.00 0.00 0.00 0.00 entertainment entertainment entertainment
http://abcnews.go.com/Entertainment/new-york-times-media-columnist-david-carr-dies-58/story?id=28936813 92.00 96.00 103.00 0.00 0.00 0.00 entertainment entertainment entertainment
http://abcnews.go.com/Entertainment/oscars-2015-live-updates-red-carpet/story?id=29075436 144.00 159.00 178.00 0.00 4.55 0.00 entertainment entertainment entertainment
http://abcnews.go.com/Entertainment/star-trek-star-leonard-nimoy-dies-83/story?id=29274628 732.00 835.00 2022.00 4.17 0.00 0.00 entertainment entertainment entertainment
http://abcnews.go.com/Entertainment/vanilla-ice-arrested-burglary-florida-police/story?id=29058510 71.00 146.00 156.00 0.00 0.00 0.00 entertainment entertainment entertainment
http://abcnews.go.com/Health/autism-speaks-urges-parents-vaccinate-children/story?id=28751485 428.00 544.00 565.00 0.00 0.00 0.00 health health health
http://abcnews.go.com/Health/blind-golden-retriever-smiley-warms-hearts-therapy-dog/story?id=29533746 311.00 441.00 653.00 7.69 14.29 7.69 health health health
http://abcnews.go.com/Health/breakdown-ingredients-childhood-vaccines/story?id=28859870 68.00 71.00 102.00 0.00 0.00 0.00 health health health
http://abcnews.go.com/Health/dads-heartfelt-plea-congress-year-leukemia-exposed-measles/story?id=28866376 189.00 207.00 331.00 0.00 0.00 0.00 health health health
http://abcnews.go.com/Health/dangerous-bacteria-mysteriously-escapes-louisiana-monkey-lab/story?id=29327907 98.00 100.00 129.00 -6.67 -7.14 -7.14 health health health
http://abcnews.go.com/Health/doctors-crawling-finish-line-great-idea/story?id=28998255 342.00 468.00 470.00 0.00 0.00 0.00 health health health
http://abcnews.go.com/Health/father-psych-ward-stabbing-victim-mental-patients-treated/story?id=28559283 38.00 42.00 56.00 -5.88 0.00 0.00 health health health
http://abcnews.go.com/Health/florida-woman-birth-141-pound-baby/story?id=28784382 195.00 196.00 241.00 0.00 0.00 0.00 health health health
http://abcnews.go.com/Health/renowned-neurologist-oliver-sacks-announces-terminal-cancer/story?id=29084210 77.00 134.00 140.00 0.00 0.00 0.00 health health health
http://abcnews.go.com/Health/sex-couples-day-biological-children-researchers/story?id=29220568 168.00 169.00 224.00 0.00 0.00 0.00 health health health
http://abcnews.go.com/Health/year-girl-dies-catching-flu-vaccine/story?id=28526729 169.00 221.00 266.00 0.00 0.00 0.00 health health health
http://abcnews.go.com/International/International/tiny-penguins-tiny-sweaters/story?id=28886035 1384.00 2712.00 3784.00 0.00 0.00 0.00 world world world
http://abcnews.go.com/International/air-canada-hard-landing-passengers-lucky-officials/story?id=29984179 58.00 59.00 66.00 12.50 -6.67 0.00 world world world
http://abcnews.go.com/International/american-hostages-mother-us-failed-children/story?id=28803264 94.00 109.00 565.00 4.76 4.76 -4.00 world world world
http://abcnews.go.com/International/american-soldier-christ-fighting-isis-iraq/story?id=29171878 51.00 57.00 263.00 0.00 -3.57 -3.57 world world world
http://abcnews.go.com/International/americans-germanwings-plane-official/story?id=29887148 17.00 32.00 89.00 0.00 -7.14 0.00 world world world
... ... ... ... ... ... ... ... ... ...
http://www.washingtonpost.com/news/the-intersect/wp/2015/03/03/the-disturbing-case-of-the-bloggers-who-fake-death-and-disease-for-attention/ 59.00 80.00 90.00 -11.11 -11.11 -11.11 news news news
http://www.washingtonpost.com/news/the-intersect/wp/2015/03/09/facebook-censored-a-nude-painting-and-it-could-change-the-site-forever/ 68.00 117.00 133.00 0.00 0.00 0.00 news news news
http://www.washingtonpost.com/news/the-intersect/wp/2015/03/13/what-was-fake-on-the-internet-this-week-putins-death-hillarys-horns-and-marijuana-at-kfc/ 47.00 54.00 113.00 -4.76 -7.69 -4.76 news news news
http://www.washingtonpost.com/news/the-intersect/wp/2015/03/20/what-was-fake-on-the-internet-this-week-smartwatch-cancer-michael-browns-mom-and-the-true-story-of-unfriended/ 26.00 56.00 69.00 0.00 -7.69 -7.69 news news news
http://www.washingtonpost.com/news/the-intersect/wp/2015/03/23/what-you-dont-know-about-internet-algorithms-is-hurting-you-and-you-probably-dont-know-very-much/ 148.00 181.00 183.00 4.17 4.35 4.35 news news news
http://www.washingtonpost.com/news/the-intersect/wp/2015/04/01/what-is-fake-on-the-internet-today-a-comprehensive-updating-list-of-april-fools-pranks-and-hoaxes/ 77.00 87.00 126.00 -5.88 -4.17 -4.55 news news news
http://www.washingtonpost.com/news/to-your-health/wp/2015/03/09/how-parents-create-narcissistic-children/ 114.00 137.00 146.00 10.00 10.00 10.00 news news news
http://www.washingtonpost.com/opinions/2015/03/27/87655262-d3f4-11e4-a62f-ee745911a4ff_story.html 47.00 54.00 76.00 -5.00 -5.00 -5.00 opinions opinions opinions
http://www.washingtonpost.com/opinions/for-richer-or-poorer-the-challenges-of-marrying-outside-your-class/2015/03/26/cd7ccf72-ccac-11e4-8a46-b1dc9be5a8ff_story.html 50.00 83.00 204.00 6.25 6.25 4.54 opinions opinions opinions
http://www.washingtonpost.com/opinions/pro-discrimination-religious-freedom-laws-are-dangerous-to-america/2015/03/29/bdb4ce9e-d66d-11e4-ba28-f2a685dc7f89_story.html 217.00 390.00 435.00 0.00 0.00 0.00 opinions opinions opinions
http://www.washingtonpost.com/opinions/who-had-the-worst-week-in-washington-rep-aaron-schock/2015/03/20/66809852-ce6f-11e4-a2a7-9517a3a70506_story.html 21.00 34.00 35.00 -6.25 -6.67 -6.67 opinions opinions opinions
http://www.washingtonpost.com/politics/absence-of-2016-competition-for-clinton-raises-stakes-for-democrats/2015/03/11/60fc4ca8-c81d-11e4-a199-6cb5e63819d2_story.html 26.00 36.00 43.00 5.00 5.88 5.26 politics politics politics
http://www.washingtonpost.com/politics/hillary-clinton-to-answer-questions-about-use-of-private-e-mail-server/2015/03/10/4c000d00-c735-11e4-a199-6cb5e63819d2_story.html 33.00 43.00 131.00 0.00 0.00 4.00 politics politics politics
http://www.washingtonpost.com/politics/how-the-white-house-decides-whose-death-is-worth-presidential-notice/2015/03/12/0c43083a-c83d-11e4-a199-6cb5e63819d2_story.html 47.00 51.00 65.00 0.00 0.00 0.00 politics politics politics
http://www.washingtonpost.com/politics/mitt-romney-warms-to-marco-rubio-as-young-senator-cultivates-relationship/2015/03/13/21a769b8-c98d-11e4-a199-6cb5e63819d2_story.html 26.00 49.00 60.00 5.00 0.00 0.00 politics politics politics
http://www.washingtonpost.com/politics/police-suspect-arrested-in-shooting-of-two-officers-in-ferguson/2015/03/15/eb3140c2-cb38-11e4-8a46-b1dc9be5a8ff_story.html 37.00 41.00 197.00 0.00 0.00 -4.00 politics politics politics
http://www.washingtonpost.com/politics/secret-service-agents-disrupted-bomb-investigation-at-white-house/2015/03/12/0eb74590-c8c4-11e4-aa1a-86135599fb0f_story.html 69.00 88.00 157.00 0.00 4.76 0.00 politics politics politics
http://www.washingtonpost.com/politics/secret-service-agents-investigated-for-late-night-car-accident-at-white-house/2015/03/11/9c853906-c7ff-11e4-a199-6cb5e63819d2_story.html 58.00 66.00 85.00 0.00 0.00 0.00 politics politics politics
http://www.washingtonpost.com/politics/state-department-reviewing-whether-clinton-e-mail-violated-security-rules/2015/03/05/16d1547e-c378-11e4-9271-610273846239_story.html 43.00 43.00 65.00 0.00 0.00 0.00 politics politics politics
http://www.washingtonpost.com/posteverything/wp/2015/03/14/this-is-why-its-impossible-for-the-kremlin-to-lie-about-putins-weird-disappearance/ 125.00 126.00 130.00 -5.26 -5.26 -5.26 posteverything posteverything posteverything
http://www.washingtonpost.com/posteverything/wp/2015/03/30/youre-not-fooling-everyone-with-your-pretend-laughter/ 24.00 26.00 43.00 0.00 0.00 0.00 posteverything posteverything posteverything
http://www.washingtonpost.com/world/africa/deep-in-the-rain-forest-hunting-for-the-next-ebola-outbreak/2015/03/19/c1cba80e-b78c-11e4-bc30-a4e75503948a_story.html 57.00 63.00 108.00 4.35 0.00 4.35 world world world
http://www.washingtonpost.com/world/after-12-years-in-guantanamo-ex-detainees-find-little-solace-in-uruguay/2015/03/21/4d376006-c1e5-11e4-a188-8e4971d37a8d_story.html 43.00 68.00 108.00 5.00 0.00 0.00 world world world
http://www.washingtonpost.com/world/asia_pacific/north-koreas-growing-economy-and-americas-misconceptions-about-it/2015/03/13/b551d2d0-c1a8-11e4-a188-8e4971d37a8d_story.html 55.00 78.00 87.00 0.00 0.00 0.00 world world world
http://www.washingtonpost.com/world/europe/come-to-rome-for-the-cathedrals-the-ruins--and-the-red-light-district/2015/03/09/880d0440-bd37-11e4-9dfb-03366e719af8_story.html 37.00 38.00 52.00 4.76 -5.56 4.76 world world world
http://www.washingtonpost.com/world/europe/report-co-pilot-on-doomed-flight-had-psychological-treatments-in-past/2015/03/27/b1818c48-d40b-11e4-8b1e-274d670aa9c9_story.html 51.00 67.00 76.00 -4.00 0.00 -4.35 world world world
http://www.washingtonpost.com/world/middle_east/the-islamic-state-is-fraying-from-within/2015/03/08/0003a2e0-c276-11e4-a188-8e4971d37a8d_story.html 24.00 54.00 131.00 -11.76 0.00 0.00 world world world
http://www.washingtonpost.com/world/negotiators-hold-marathon-all-night-session-in-last-ditch-effort-for-agreement/2015/04/02/68334c88-d8b2-11e4-bf0b-f648b95a6488_story.html 51.00 68.00 102.00 6.25 0.00 5.00 world world world
http://www.washingtonpost.com/world/pilot-reportedly-locked-out-of-cockpit-before-plane-crashed-into-alpine-mountainside/2015/03/26/460770d8-d38c-11e4-a62f-ee745911a4ff_story.html 78.00 104.00 255.00 0.00 -5.56 -4.35 world world world
http://www.washingtonpost.com/world/plane-carrying-150-crashes-in-france-apparently-no-survivors/2015/03/24/6fe0fc70-d225-11e4-a62f-ee745911a4ff_story.html 103.00 105.00 253.00 0.00 -4.35 0.00 world world world

529 rows × 9 columns

In [4]:
# Strip plots of retweet counts, one panel per outlet, restricted to news
# that were tweeted at least four times. X ticks are hidden in every panel.
df4 = tn.groupby('href').filter(lambda grp: len(grp) > 3)

# 3x3 grid: six single cells, then a double-width panel and a single cell
# in the bottom row (eight panels in total).
gs = gridspec.GridSpec(3, 3)
axs = [
    plt.subplot(cell)
    for cell in (gs[0, 0], gs[0, 1], gs[0, 2],
                 gs[1, 0], gs[1, 1], gs[1, 2],
                 gs[2, :2], gs[2, 2])
]

# Outlets whose y axis gets a fixed range before plotting
y_caps = {'CNN': 3000, 'ABC': 3000, 'NYT': 2000, 'WPOST': 1000}

for idx, outlet in enumerate(df4.outlet.unique()):
    panel = axs[idx]
    panel.set_title(outlet)
    cap = y_caps.get(outlet)
    if cap is not None:
        panel.set_ylim([0, cap])
    g = sns.stripplot(x="href", y="rt", data=df4[df4.outlet == outlet],
                      ax=panel, jitter=True, size=4)
    # Only the first panel of each group of three gets a y label,
    # and only panels from index 6 onwards get an x label.
    if idx % 3 == 0:
        ylabel = 'Retweet'
    else:
        ylabel = ''
    if idx >= 6:
        xlabel = 'Tweeted News'
    else:
        xlabel = ''
    panel.set(xlabel=xlabel, ylabel=ylabel, ylim=0, xticks=[])

# Resize, title and save the whole figure
f = plt.gcf()
f.set_size_inches(10, 7, forward=True)
f.suptitle('Retweet Counts of The News Tweeted at Least Four Times',
           fontsize=18, fontweight='bold')
f.savefig('figs/rt-news-stripplot.png', bbox_inches='tight')
In [5]:
# Retweet statistics side by side: tweets without a url, tweets with any url,
# and the tweeted-news table `tn`.
tw = pd.read_csv('data/LIWC/LIWC2015 Results (tweet-texts).csv', encoding='utf-8')
nan = tw[tw['url'].isnull()]    # rows whose url field is missing
url = tw[tw['url'].notnull()]   # rows that carry a url
pd.DataFrame({
    'no url': nan['rt'].describe(),
    'any url': url['rt'].describe(),
    'news url': tn['rt'].describe(),
})
Out[5]:
any url news url no url
count 23255.00 16909.00 2344.00
mean 135.47 134.34 206.62
std 355.89 259.36 759.46
min 0.00 1.00 0.00
25% 43.00 45.00 44.00
50% 72.00 74.00 94.00
75% 133.00 134.00 207.00
max 36985.00 11031.00 31123.00
In [14]:
# Retweeted News: weight every tweeted story by its retweet count by repeating
# its row int(1 + rt/50) times, so each tweet appears at least once.
#
# The repetition is done in one vectorised positional selection. The earlier
# version built a separate DataFrame per row inside iterrows() and concatenated
# them, which took about 3.5 minutes. Row order and index labels are the same
# as before (each label repeated consecutively), and the column dtypes of `tn`
# are kept as they are.
repeats = (1 + tn['rt'] / 50).astype(int).values
rn = tn.iloc[np.repeat(np.arange(len(tn)), repeats)]
In [6]:
# Summary statistics of sentiment polarity (sp) for the published, tweeted
# and retweeted news tables, one column per table
pd.DataFrame({
    label: frame['sp'].describe()
    for label, frame in [('Published', pn), ('Tweeted', tn), ('Retweeted', rn)]
})
Out[6]:
Published Retweeted Tweeted
count 35930.000000 53792.000000 16909.000000
mean 0.514790 0.201119 0.124686
std 1.793846 1.977284 1.914987
min -10.730000 -8.650000 -8.650000
25% -0.510000 -0.990000 -1.050000
50% 0.550000 0.180000 0.090000
75% 1.630000 1.360000 1.260000
max 11.320000 14.700000 14.700000
In [7]:
# Mean retweet count vs. mean sentiment polarity per news category.
# We can filter the categories by increasing the number of news required
# per category (`filtr`); the listed category names are always left out.
filtr = 100
excluded = 'news article storyline bigstory'.split()
df = tn.groupby('cat').filter(lambda x: len(x) > filtr and x.name not in excluded)

# One row per category; pivot_table's default aggregation is the mean
df = pd.pivot_table(df, values=['sp', 'rt'], index=['cat'])

# x / y / data are passed by keyword: current seaborn no longer accepts them
# positionally.
ax = sns.regplot(x='rt', y='sp', data=df)
ax.set(xlabel='Retweet', ylabel='Sentiment Polarity', title='SP vs RT of Categories', ylim=(-1, 2))

# Label every point with its category. Coordinates are looked up by column
# name so they cannot be swapped if the pivot's column order ever changes.
for cat_name, row in df.iterrows():
    ax.annotate(cat_name, (row['rt'], row['sp']), xytext=(5, -2), textcoords='offset points')

plt.gcf().savefig('figs/cat-rt-sp-scatter.png')
In [8]:
# Mean, median and max of retweets per outlet.
# Aggregations are named by string alias: passing the np.mean / np.median / max
# callables is deprecated in recent pandas, and the aliases give the same
# 'mean' / 'median' / 'max' column labels.
pd.pivot_table(
    tn,
    index=['outlet'],
    values=['rt'],
    aggfunc=['mean', 'median', 'max'],
).rename(columns={'rt': 'Retweet'})
Out[8]:
mean median max
Retweet Retweet Retweet
outlet
ABC 184.100213 110.5 6994
AP 89.745522 70.0 862
CBSNews 72.585079 48.0 2458
CNN 396.500725 248.0 7752
FoxNews 134.664111 89.0 3122
NBCNews 85.786790 55.0 11031
NYT 139.529051 84.0 8917
WPOST 83.048544 59.0 3683
In [9]:
# Density plot of retweet counts, one curve per outlet on a shared axes.
# A plain loop replaces the list comprehension that was used only for its
# side effects.
ax = plt.gca()
for outlet in outlets:
    sns.kdeplot(tn.loc[tn.outlet == outlet, 'rt'], label=outlet, ax=ax)
ax.set(xlim=(0, 500), title='Retweet Distributions', xlabel='Retweet Count', ylabel='Density')
# Current seaborn does not draw a legend for `label=` on its own
ax.legend();
In [10]:
# Sentiment polarities [posemo - negemo] of news: density curves for the
# published, tweeted and retweeted tables drawn on one axes.
c = 'sp'
ax = plt.gca()
for frame, label in ((pn, 'Published News'), (tn, 'Tweeted News'), (rn, 'Retweeted News')):
    sns.kdeplot(frame[c], label=label, ax=ax)
ax.set_title('Comparing Sentiment Polarity of News Published/Tweeted/Retweeted - All Outlets Combined')
ax.set(xlim=(-10, 10), xlabel='Sentiment Polarity', ylabel='Density')
# Current seaborn does not draw a legend for `label=` on its own
ax.legend()
plt.savefig('figs/sentiment-comparison-published-tweeted-retweeted.png', bbox_inches='tight')
In [11]:
# Density curves of the 'Tone' column for the published, tweeted and
# retweeted tables, drawn on one axes.
ax = plt.gca()
for frame, label in ((pn, 'Published News'), (tn, 'Tweeted News'), (rn, 'Retweeted News')):
    sns.kdeplot(frame['Tone'], label=label, ax=ax)
ax.set_title('Comparing Tone of News Published/Tweeted/Retweeted - All Outlets Combined')
ax.set_xlabel('Tone')
ax.set_ylabel('Density')
# Current seaborn does not draw a legend for `label=` on its own
ax.legend()
plt.savefig('figs/tone-comparison-published-tweeted-retweeted.png', bbox_inches='tight')
In [12]:
# Sentiment polarity densities of the published news, one curve per outlet.
# A plain loop replaces the list comprehension that was used only for its
# side effects.
df = pn
ax = plt.gca()
for outlet in outlets:
    sns.kdeplot(df.loc[df['outlet'] == outlet, 'sp'], label=outlet, ax=ax)
ax.set(xlim=(-10, 10))
ax.set_title('Sentiment Polarities of Published News')
ax.set_xlabel('Sentiment Polarity')
ax.set_ylabel('Density')
# Current seaborn does not draw a legend for `label=` on its own
ax.legend()
plt.savefig('figs/sentiment-published.png', bbox_inches='tight')