#!/usr/bin/env python
# coding: utf-8

# ## Attribution of Responsibility and Blame Regarding a Man-made Disaster: #FlintWaterCrisis
#
# by [Talha Oz](http://talhaoz.com) and [Halil Bisgin](http://halilbisgin.com)
#
# *This work was presented at the 4th International Workshop on Social Web for Disaster Management (SWDM'16), co-located with CIKM 2016; here is the [paper](https://arxiv.org/abs/1610.03480).*
#
# **Abstract**
# Attribution of responsibility and blame are important topics in political science, especially as individuals tend to think of political issues in terms of questions of responsibility, and as blame carries far more weight in voting behavior than credit does. Surprisingly, however, there is a paucity of studies on the attribution of responsibility and blame in the field of disaster research.
#
# The Flint water crisis is a story of government failure at all levels. By studying microblog posts about it, we examine how citizens assign responsibility and blame for such a man-made disaster online. We form hypotheses based on social scientific theories in disaster research and then operationalize them on unobtrusive, observational social media data. In particular, we investigate the following phenomena: the source of blame; partisan predisposition; the concerned geographies; and the contagion of complaining.
#
# This paper adds to the sociology of disasters research by exploiting a new, rarely used data source (the social web) and by employing new computational methods (such as sentiment analysis and a retrospective cohort study design) on this new form of data. In this regard, this work should be seen as a first step toward drawing more challenging inferences on the sociology of disasters from "big social data".

# In[ ]:

# Read the raw JSON data and save it to Flint.pkl once; whenever we want to
# read the data later, we read the pickle instead of the raw JSON files.
# This code block is here just to show how we created the pickle (.pkl) file.
import pandas as pd
import json
from glob import glob
from datetime import datetime

tw = []
for f in glob("data/TweetCollection/*.json"):
    with open(f, 'r', encoding='utf-8') as fin:
        for line in fin:
            a = json.loads(line)
            tw.append({'id': a['id_str'],
                       'created_at': datetime.strptime(a['created_at'], '%a %b %d %H:%M:%S +0000 %Y'),
                       'hashtagged': any(['flintwatercrisis' in h['text'].lower() for h in a['entities']['hashtags']]),
                       'screen_name': a['user']['screen_name'],
                       'location': a['user']['location'],
                       'followers': a['user']['followers_count'],
                       'verified': bool(a['user']['verified']),
                       'text': a['text']})
df = pd.DataFrame(tw).set_index('id').drop_duplicates()
#df.to_pickle('data/Flint.pkl')

# In[1]:

import pandas as pd
import numpy as np
pd.set_option('max_colwidth', 200)
df = pd.read_pickle('../data/Flint.pkl')
from utilities.geocoder import Geocoder
gc = Geocoder('utilities/geodata/state_abbr_file', 'utilities/geodata/city_file')
df['latlon'] = df.location.str.strip().apply(gc.geocode)
from IPython.display import HTML
HTML(df.head().to_html(index=False))  # what the data looks like

# In[2]:

g = df.groupby('text').size().reset_index()
g.columns = ['text', 'cnt']
g = g.sort_values('cnt', ascending=False)
print('total tw:', len(df), '\nunique tw:', len(g))
g.head()  # most popular tweets
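# Several cells below score tweet texts with NLTK's VADER sentiment analyzer and rely on its
# `compound` field. The next cell is a minimal, self-contained illustration of that scorer on a
# made-up example string (not a tweet from the dataset); it assumes the `vader_lexicon` resource
# has been downloaded once via `nltk.download('vader_lexicon')`.

# In[ ]:

from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid_demo = SentimentIntensityAnalyzer()
example = "The governor failed Flint, this is outrageous!"  # illustrative text, not from the data
print(sid_demo.polarity_scores(example))  # dict with 'neg', 'neu', 'pos', and 'compound' in [-1, 1]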
# In[15]:

# The original dates are in UTC/GMT; convert them to EST.
# Also, as given in footnote #4, report the missing dates.
import pytz
eastern = pytz.timezone('US/Eastern')
# group tweets by day
df.created_at = df.created_at.dt.tz_localize(pytz.utc).dt.tz_convert(eastern)
# print missing date intervals in our dataset
day = df.groupby(df.created_at.dt.strftime('%m-%d'))['created_at'].count()
days = day.index.tolist()
for i in range(len(days)-1):
    m1, d1 = days[i].split('-')
    m2, d2 = days[i+1].split('-')
    if m1 == m2:
        if int(d1) == int(d2) - 1: continue
    else:
        if d2 == '01': continue
    print('('+days[i]+','+days[i+1]+')', end=' ')

# In[210]:

# Figure 1
import matplotlib.pyplot as plt
import matplotlib
get_ipython().run_line_magic('matplotlib', 'inline')
import seaborn as sns
matplotlib.style.use('fivethirtyeight')
matplotlib.style.use('ggplot')
plt.rcParams['axes.facecolor'] = 'w'
plt.rcParams['savefig.facecolor'] = 'w'
matplotlib.rcParams['font.size'] = 14

# plot daily activity
ax = day.plot(kind="bar", figsize=(18, 4))  # ,title='#FlintWaterCrisis Activity on Twitter'
#ax.set_xlabel('Days After Flint Became a Federal State of Emergency on 2016-01-16', fontsize=14)
ax.set_ylabel('Tweets in the 1% sample', fontsize=14)
for label in ax.xaxis.get_ticklabels()[::2]:
    label.set_visible(False)
ax.annotate('Federal State of Emergency', xy=(0, 31000))
ax.annotate('Gov. Rick Snyder holds a news conf.\n'
            'Groups file a federal lawsuit', xy=(12, 41000), ha='center')
ax.annotate('First Flint hearing in Congress\n'
            'Hillary visits Flint', xy=(22, 9000), ha='center')
ax.annotate('GOP debate in Detroit,MI\nRubio defends MI governor', xy=(31, 23000), ha='center')
ax.annotate('DEM debate in Flint,MI\nBoth candidates call on Snyder to resign', xy=(36, 45000), ha='center')
ax.annotate('MI primaries\nfor both parties', xy=(40, 33000), ha='center')
ax.annotate('Gov Snyder & EPA admin McCarthy\ntestify before Congress', xy=(47, 11000), ha='center')
ax.annotate('A local\'s complaining tweet goes viral\nGov Snyder asks the lawsuit be dismissed', xy=(69, 9000), ha='center')
ax.annotate('Obama visits Flint', xy=(88, 10000), ha='center')
ax.set_xlim([-1, 93])
ax.set_xlabel('');
ax.get_figure().savefig('../figs/daily.pdf', dpi=150, bbox_inches='tight')

# In[3]:

# Figure 2
import matplotlib.pyplot as plt
import matplotlib
get_ipython().run_line_magic('matplotlib', 'inline')
import seaborn as sns
#matplotlib.style.use('fivethirtyeight')
matplotlib.style.use('ggplot')
plt.rcParams['axes.facecolor'] = 'w'
plt.rcParams['savefig.facecolor'] = 'w'
matplotlib.rcParams['font.size'] = 14

# stack the five raters' label files
l = pd.DataFrame()
for i in range(5):
    r = pd.read_csv('../data/training/Flint'+str(i+1)+'_train.csv')
    r['rater'] = i
    l = l.append(r)
l = l.fillna('missing')
l['label'] = l.c.replace({',.*': '', 'missing': 10}, regex=True).astype(int)  # removes multiple labels (keeps the first)

# get pairwise kappas
from itertools import combinations
from statsmodels.stats.inter_rater import fleiss_kappa
from statsmodels.stats.inter_rater import aggregate_raters
kappa = []
for r1, r2 in combinations(range(5), 2):
    rr = l[l.rater==r1].merge(l[l.rater==r2], on='text')[['label_x','label_y']]
    k = fleiss_kappa(aggregate_raters(rr, n_cat=11)[0])
    kappa.append(('r'+str(r1), 'r'+str(r2), k))
    kappa.append(('r'+str(r2), 'r'+str(r1), k))  # (r2,r1,k)
a = pd.DataFrame(kappa).pivot(0, 1, 2)  # pairwise inter-rater fleiss-kappa
a.index.name = None
a.columns.name = None

plt.figure(num=None, figsize=(6, 4), facecolor='w', edgecolor='w')
labels = ['No blame','MI Governor','POTUS','Flint Mayor',
          'EPA','Emergency M.','Republicans','Democrats','Government','Other indiv.',
          'Unsure']
cnt = [len(l[l.c.str.contains(str(i))]) for i in range(10)]  # count of each label
cnt.append(len(l[l.c.str.contains('missing')]))
ax = plt.subplot()
ax.margins(0, 0)
colors = '#777777 #E24A33 #348ABD #348ABD #348ABD #E24A33 #E24A33 #348ABD #FBC15E #8EBA42 #FFB5B8'.split()
#[color['color'] for color in list(plt.rcParams['axes.prop_cycle'])]
ax.barh(range(len(cnt)), cnt, tick_label=labels, align='center', color=colors)
#ax.set(xlabel='Manually coded tweets');  # title='Attribution of Blame/Responsibility',
#ax.grid(color='grey', linestyle='dotted', linewidth=0.5)
plt.axes([.4, .33, .55, .55])
sns.heatmap(a, annot=True, vmin=0, vmax=1, cmap='RdBu_r', annot_kws={'size': 12})
#ax.get_figure().savefig('../figs/coders.pdf',dpi=150,bbox_inches='tight')

# In[34]:

# Table 1
df1 = pd.read_csv('../data/us-city-populations.csv', usecols=['CityST','2000','2010','LAT','LON','County_Name'])
df2 = pd.read_csv('../data/city_file.csv', dtype={'lat': str, 'lon': str})
df2['CityST'] = df2.city + ', ' + df2.state
cities = df1.merge(df2, on='CityST', how='inner')
cities['latlon'] = cities[['lat','lon']].apply(tuple, axis=1)
cnt = pd.DataFrame(df.groupby(by='latlon').size().reset_index().rename(columns={0: 'cnt'}))
cities = cities.merge(cnt, on='latlon', how='inner').rename(columns={'2010': 'cpop', 'County_Name': 'county'})
cities = cities[cities.cnt>=3]
cities.loc[cities.cpop.isnull(), 'cpop'] = cities[cities.cpop.isnull()]['2000']
cities = cities.sort_values('cnt', ascending=False).reset_index().drop(['index','LAT','LON','2000'], 1)
cities.cpop = cities.cpop.astype(int)
#cities.to_csv('data/cities.csv',index=False)
cities.head(10)  # tweet counts without normalization

fil = cities[cities.cpop>88].copy()
fil['normalized'] = fil.cnt * 1000 / fil.cpop
fil = fil[fil.normalized>=1]
fil.sort_values('normalized', ascending=False).head(10)  # normalized

city10 = fil.sort_values('normalized', ascending=False).head(10).reset_index()
city10 = city10.rename(columns={'CityST': 'Cities'})
cofil = fil.groupby(['county','state']).sum()
cofil.normalized = cofil.cnt / np.sqrt(cofil.cpop)
county10 = cofil.sort_values(by='normalized', ascending=False).head(10).reset_index()
county10['Counties'] = county10.county + ', ' + county10.state
cc = pd.concat([city10.Cities, county10.Counties], axis=1)
cc.index += 1
print(cc.to_latex())

# In[4]:

from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
snyder = df.text.str.contains('governor|nyder|onetoughnerd', case=False)
EM = df.text.str.contains('mgr|manager|Darnell|Earley|Kurtz', case=False)
mayor = df.text.str.contains('Dayne|Walling|ayor', case=False)
print([len(df[x]) for x in [snyder, EM, mayor]])  # Footnote 10.
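# The masks above (and similar ones later for the `blames` subset) are plain case-insensitive
# regex alternations combined with boolean operators. The toy example below illustrates the
# pattern on made-up strings (not dataset tweets); it is a sketch, not part of the analysis.

# In[ ]:

import pandas as pd  # already imported above; repeated so the sketch is self-contained

toy = pd.Series(["Gov. Snyder must resign", "Thank you Mayor Walling", "EPA knew about the water"])
m_gov = toy.str.contains('governor|nyder|onetoughnerd', case=False)  # mentions the governor
m_may = toy.str.contains('Dayne|Walling|ayor', case=False)           # mentions the mayor
print(toy[m_gov & ~m_may])  # governor-only mentions, analogous to df[snyder&~mayor&~EM] below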
os.system("convert -delay 200 top30.mp4 top30.gif") #imagemagick's convert anim #eye candy for the presentation :-) # In[180]: # Figure 3 (new) c = pd.DataFrame() for i in range(5): r = pd.read_csv('../data/training/Flint'+str(i+1)+'_train.csv') r['rater'] = i c = c.append(r) c = c.dropna() print(len(c[c.c.str.contains('6')]),len(c[c.c.str.contains('7')])) r = df.screen_name[df.text.isin(c[c.c.str.contains('6')].text)] d = df.screen_name[df.text.isin(c[c.c.str.contains('7')].text)] m = mayoronly[mayoronly.cmpnd!=0] s = snyderonly[snyderonly.cmpnd!=0] #matplotlib.style.use('fivethirtyeight') matplotlib.style.use('ggplot') plt.rcParams['axes.facecolor']='w' plt.rcParams['savefig.facecolor']='w' matplotlib.rcParams['xtick.labelsize'] = 16 matplotlib.rcParams['ytick.labelsize'] = 16 matplotlib.rcParams['axes.titlesize'] = 18 co = {'color':'black'} ma = {'color':'black','linestyle':'-'} boxprops = dict(linestyle='-', color='black') f, ax = plt.subplots(1, 2, sharey=True,figsize=(8,3)) titles = ['Governor','Mayor'] for i,a in enumerate([s,m]): bp = ax[i].boxplot([a[a.screen_name.isin(d)].cmpnd,a[a.screen_name.isin(r)].cmpnd], patch_artist=True, whiskerprops=co,capprops=co,medianprops=ma,boxprops=boxprops,labels=['Blaming R','Blaming D']) for box, color in zip(bp['boxes'], ['#348ABD','#E24A33']): box.set_color('black') box.set_facecolor(color) ax[i].set_title(titles[i],y=.9) ax[i].yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5) ax[0].set_ylabel('Sentiment score',fontsize=18); f.savefig('../figs/box-partisanship.pdf',dpi=150,bbox_inches='tight') # In[5]: snyderonly = df[snyder&~mayor&~EM].copy() mayoronly=df[mayor&~snyder&~EM].copy() a = pd.DataFrame(list(snyderonly.text.apply(sid.polarity_scores))) snyderonly = pd.concat([snyderonly.reset_index(),a.rename(columns={'compound':'cmpnd'})],axis=1) a = pd.DataFrame(list(mayoronly.text.apply(sid.polarity_scores))) mayoronly = pd.concat([mayoronly.reset_index(),a.rename(columns={'compound':'cmpnd'})],axis=1) # In[130]: from scipy.stats import ks_2samp from math import sqrt c_a = 1.95 #coefficient c_a is 1.36 for alpha 0.05 and 1.95 for alpha 0.001 for i,a in enumerate([s,m]): print(ks_2samp(a[a.screen_name.isin(d)].cmpnd,a[a.screen_name.isin(r)].cmpnd)) n1 = len(a[a.screen_name.isin(d)]) n2 = len(a[a.screen_name.isin(r)]) print('Critical value D_a:',c_a*sqrt((n1+n2)/(n1*n2))) # In[60]: #now the contagion "experiment" import matplotlib.pyplot as plt import matplotlib get_ipython().run_line_magic('matplotlib', 'inline') import seaborn as sns #matplotlib.style.use('fivethirtyeight') matplotlib.style.use('ggplot') plt.rcParams['axes.facecolor']='w' plt.rcParams['savefig.facecolor']='w' plt.rcParams['savefig.dpi']=227 #DPI of my 13.3 MacBook Pro Retina f = df[df.latlon == gc.geocode("Flint, MI")].groupby('screen_name').size() f.sort_values(ascending=False).plot(ylim=(0,30),linestyle="None",marker='.',figsize=(10,5)) f2 = list(f[(f>2)&(f<20)].index.values) print(len(f2)) #Natural selection: Flinters who tweeted 20]),len(ffdf[ffdf.usent<0])) # In[152]: udf = {} for u in ffdf.index.values: usent = pd.DataFrame(list(df[df.screen_name == u].text.apply(sid.polarity_scores)))['compound'] udf[u] = {'utwcnt':len(usent),'usent':usent.mean()} usdf = pd.DataFrame.from_dict(udf,orient='index') usdf # In[158]: fdf = {} for k,v in friends.items(): fsk = fs[fs.screen_name.isin(v.screen_name)] fsent = fsk.cmpnd.mean() fdf[k] = {'ftwcnt':len(fsk),'dfcnt':len(fsk.screen_name.unique()),'tfcnt':len(v),'fsent':fsent} ffdf = 
pd.DataFrame.from_dict(fdf,orient='index') ffdf = ffdf.join(usdf) #ffdf.to_csv('../data/ffdf.csv') ffdf # In[207]: len(fr),df[df.screen_name.isin(fr)].screen_name.nunique() # In[205]: print(len(ffdf[ffdf.usent>0]),len(ffdf[ffdf.usent<0]),len(ffdf[ffdf.fsent>0]),len(ffdf[ffdf.fsent<0])) # In[167]: ffdf.corr(method='pearson') #.loc['fsent','usent'] = .16 # In[211]: ffdf[ffdf.usent>0].fsent.mean(),ffdf[ffdf.usent<0].fsent.mean() # In[228]: colorm = dict(boxes='lightgreen', whiskers='black', medians='black', caps='black') #ax=compare[['followers','population']].plot(kind='box', patch_artist=True, showfliers=False) boxprops = dict(linestyle='-', color='black') matplotlib.style.use('ggplot') plt.rcParams['axes.facecolor']='w' plt.rcParams['savefig.facecolor']='w' matplotlib.rcParams['xtick.labelsize'] = 20 matplotlib.rcParams['ytick.labelsize'] = 18 matplotlib.rcParams['axes.titlesize'] = 14 co = {'color':'black'} ma = {'color':'black','linestyle':'-'} plt.figure(figsize=(9,3)) cohort = ffdf[ffdf.usent<0].fsent #ffdf[ffdf.fsent<0].usent control= ffdf[ffdf.usent>0].fsent.dropna() #ffdf[ffdf.fsent>0].usent print(cohort.mean(),control.mean()) bp = plt.boxplot([cohort,control],patch_artist=True, showfliers=False, whiskerprops=co,capprops=co,medianprops=ma,boxprops=boxprops,labels=['Friends of the cohort','Friends of the control']) ax = plt.gca() for patch, color in zip(bp['boxes'], ['magenta','lightgreen']): patch.set_facecolor(color) ax.set_ylabel('Sentiment score',fontsize=22) ax.set_ylim(-.23,.08) #plt.yticks(np.arange(-.6, .6, .1)) ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5) ax.get_figure().savefig('../figs/contagion-exp2.pdf', bbox_inches='tight') # In[193]: from scipy.stats import ks_2samp from math import sqrt c_a = 1.36 #coefficient c_a is 1.36 for alpha 0.05 and 1.95 for alpha 0.001 print(ks_2samp(ffdf[ffdf.fsent<0].usent,ffdf[ffdf.fsent>0].usent)) n1 = len(ffdf[ffdf.fsent<0]) n2 = len(ffdf[ffdf.fsent>0]) print('Critical value D_a (ks statistic (D) should be greater than this):',c_a*sqrt((n1+n2)/(n1*n2))) #that is the case for 95% confidence level: https://daithiocrualaoich.github.io/kolmogorov_smirnov/ # In[214]: from scipy.stats import ks_2samp from math import sqrt c_a = 1.36 #coefficient c_a is 1.36 for alpha 0.05 and 1.95 for alpha 0.001 print(ks_2samp(ffdf[ffdf.usent<0].fsent,ffdf[ffdf.usent>0].fsent)) n1 = len(ffdf[ffdf.usent<0]) n2 = len(ffdf[ffdf.usent>0]) print('Critical value D_a (ks statistic (D) should be greater than this):',c_a*sqrt((n1+n2)/(n1*n2))) #that is the case for 95% confidence level: https://daithiocrualaoich.github.io/kolmogorov_smirnov/ # In[66]: def get_friends(screen_name,twitter_api,limit=5000): get_followers_ids = partial(make_twitter_request,twitter_api.friends.ids, count=5000) ids = [] cursor = -1 while cursor != 0: response = get_followers_ids(screen_name=screen_name, cursor=cursor) if response is not None: ids += response['ids'] cursor = response['next_cursor'] # print('Fetched {0} total {1} ids for {2}. 
next_cursor: {3}'.format( # len(ids), label, screen_name, cursor)) if len(ids) >= limit or response is None: break return ids_to_snames(twitter_api,ids[:limit],screen_name=screen_name) def ids_to_snames(twitter_api,fids,screen_name='tozcss'): get_snames = partial(make_twitter_request,twitter_api.users.lookup) resp = [] for i in range(1+(len(fids)-1)//100): resp.extend(get_snames(user_id=fids[100*i:100*(i+1)])) header = ['id_str','screen_name', 'name', 'location', 'description', 'created_at', \ 'friends_count','followers_count','statuses_count','favourites_count'] df = pd.DataFrame.from_dict(resp)[header].set_index('id_str') return df def make_twitter_request(twitter_api_func, max_errors=10, *args, **kw): # A nested helper function that handles common HTTPErrors. Return an updated # value for wait_period if the problem is a 500 level error. Block until the # rate limit is reset if it's a rate limiting issue (429 error). Returns None # for 401 and 404 errors, which requires special handling by the caller. def handle_twitter_http_error(e, wait_period=2, sleep_when_rate_limited=True): if wait_period > 3600: # Seconds print ('Too many retries. Quitting.',file=sys.stderr) raise e # See https://dev.twitter.com/docs/error-codes-responses for common codes if e.e.code == 401: print ('Encountered 401 Error (Not Authorized)',file=sys.stderr) return None elif e.e.code == 404: print ('Encountered 404 Error (Not Found)',file=sys.stderr) return None elif e.e.code == 429: print ('Encountered 429 Error (Rate Limit Exceeded)',file=sys.stderr) if sleep_when_rate_limited: print ("Retrying in 15 minutes...ZzZ...",file=sys.stderr) sys.stderr.flush() time.sleep(60*15 + 5) print ('...ZzZ...Awake now and trying again.',file=sys.stderr) return 2 else: raise e # Caller must handle the rate limiting issue elif e.e.code in (500, 502, 503, 504): print ('Encountered',e.e.code,'Error. Retrying in',wait_period,'seconds',file=sys.stderr) time.sleep(wait_period) wait_period *= 1.5 return wait_period else: raise e # End of nested helper function wait_period = 2 error_count = 0 while True: try: return twitter_api_func(*args, **kw) except t.api.TwitterHTTPError as e: error_count = 0 wait_period = handle_twitter_http_error(e, wait_period) if wait_period is None: return except URLError as e: error_count += 1 time.sleep(wait_period) wait_period *= 1.5 print ("URLError encountered. Continuing.",file = sys.stderr) if error_count > max_errors: print ("Too many consecutive errors...bailing out.",file=sys.stderr) raise except BadStatusLine as e: error_count += 1 time.sleep(wait_period) wait_period *= 1.5 print ("BadStatusLine encountered. 
Continuing.",file=sys.stderr) if error_count > max_errors: print ("Too many consecutive errors...bailing out.",file=sys.stderr) raise # In[56]: import subprocess #the table that went into the presentation template = r'''\documentclass[preview]{{standalone}} \usepackage{{booktabs}} \usepackage[vcentering,dvips]{{geometry}} \geometry{{total={{3.05in}}}} \begin{{document}} {} \end{{document}} ''' filename="../figs/concerned_geo.tex" with open(filename, 'w') as f: f.write(template.format(cc.to_latex())) subprocess.call(['pdflatex', filename],cwd=r'../figs'); # # The rest is not used in the paper # In[160]: # Figure 4 of version1 compare = pd.read_table('../data/popVSfollower1000.txt',header=0, sep="\t") colorm = dict(boxes='lightgreen', whiskers='black', medians='black', caps='black') #ax=compare[['followers','population']].plot(kind='box', patch_artist=True, showfliers=False) boxprops = dict(linestyle='-', color='black') matplotlib.style.use('ggplot') plt.rcParams['axes.facecolor']='w' plt.rcParams['savefig.facecolor']='w' matplotlib.rcParams['xtick.labelsize'] = 20 matplotlib.rcParams['ytick.labelsize'] = 18 matplotlib.rcParams['axes.titlesize'] = 14 co = {'color':'black'} ma = {'color':'black','linestyle':'-'} plt.figure(figsize=(9,3)) bp = plt.boxplot([compare.followers,compare.population],patch_artist=True, showfliers=False, whiskerprops=co,capprops=co,medianprops=ma,boxprops=boxprops,labels=['mayor','governor']) ax = plt.gca() for patch, color in zip(bp['boxes'], ['magenta','lightgreen']): patch.set_facecolor(color) ax.xaxis.set_ticklabels(['Cohort','Control']) ax.set_ylabel('Sentiment score',fontsize=22) #ax.set_ylim([-.23, -.12]) ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5) ax.get_figure().savefig('../figs/contagion-exp.pdf', bbox_inches='tight') # In[214]: # Figure 3 mayonly_avgsent=pd.DataFrame(mayoronly.groupby(['screen_name'],as_index=False).mean()['sent']) snyderonly_avgsent=pd.DataFrame(snyderonly.groupby(['screen_name'],as_index=False).mean()['sent']) print(mayonly_avgsent.sent.mean()) print(snyderonly_avgsent.sent.mean()) # fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True) colorm = dict(boxes='magenta', whiskers='black', medians='black', caps='black') colorg = dict(boxes='lightgreen', whiskers='black', medians='black', caps='black') #matplotlib.style.use('fivethirtyeight') matplotlib.style.use('ggplot') plt.rcParams['axes.facecolor']='w' plt.rcParams['savefig.facecolor']='w' matplotlib.rcParams['font.size'] = 14 #plt.figure(num=None, figsize=(12, 8), facecolor='w', edgecolor='w') c = {'color':'black'} m = {'color':'black','linestyle':'-'} boxprops = dict(linestyle='-', color='black') bp = plt.boxplot([mayonly_avgsent,snyderonly_avgsent], patch_artist=True, whiskerprops=c,capprops=c,medianprops=m,boxprops=boxprops,labels=['Mayor','Governor']) for patch, color in zip(bp['boxes'], ['#348ABD','#E24A33']): patch.set_facecolor(color) ax = plt.gca() ax.set_ylabel('Sentiment score') ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey', alpha=0.5) ax.get_figure().savefig('../figs/box-mayor-gov.pdf',dpi=150,bbox_inches='tight') # In[217]: # Figure 4 mayonly_avgsent=pd.DataFrame(mayoronly.groupby(['screen_name'],as_index=False)['sent'].mean()) snyderonly_avgsent=pd.DataFrame(snyderonly.groupby(['screen_name'],as_index=False)['sent'].mean()) pro_may_avgent = mayonly_avgsent[mayonly_avgsent.sent>0].screen_name.unique() comment_both = snyderonly_avgsent[snyderonly_avgsent.screen_name.isin(pro_may_avgent)].screen_name.unique() 
ax=mayonly_avgsent[mayonly_avgsent.screen_name.isin(comment_both) & mayonly_avgsent.sent!=0].sent.plot(kind='density', xlim=(-1,1),color='#348ABD') snyderonly_avgsent[snyderonly_avgsent.screen_name.isin(comment_both) & snyderonly_avgsent.sent!=0 ].sent.plot(kind='density', ax=ax, xlim=(-1,1), color = '#E24A33') ax.legend(['Mayor','Governor'],loc=2) ax.set_xlabel('Sentiment score') ax.get_figure().savefig('../figs/pro_mayors_gov.pdf', bbox_inches='tight') # In[137]: plt.figure(num=None, figsize=(12, 6), facecolor='w', edgecolor='w') fil.cnt.plot(loglog=True,linestyle='',marker='.') fil.cpop.plot(loglog=True,linestyle='',marker='.') #fil.normalized.plot(loglog=True,linestyle='',marker='.') plt.legend(['tweet count','city population']); # In[98]: ax = fil.plot.scatter(x='cnt',y='cpop',figsize=(12,5)) fil[['cnt','cpop','CityST']].apply(lambda x: ax.text(*x),axis=1); plt.xlim(0,20000); plt.ylim(0,3000000); # In[12]: #compare the sentiments of blame and no-blame tweets from nltk.sentiment.vader import SentimentIntensityAnalyzer sid = SentimentIntensityAnalyzer() nobl = pd.DataFrame(list(l[l.c=='1'].text.apply(sid.polarity_scores)))['compound'] blame = pd.DataFrame(list(l[l.c!='2'].text.apply(sid.polarity_scores)))['compound'] nobl.plot.density() blame.plot.density() plt.legend(['no blame','blame']) plt.xlim(-1,1); plt.gcf().set_size_inches(6,2) #plt.gcf().savefig('../figs/blame-sentiment.pdf',dpi=150,bbox_inches='tight') # In[99]: blamers = [df[df.text.isin(l[l.c.str.contains(str(i))].text.unique())].screen_name.unique() for i in range(10)] blamers.append(df[df.text.isin(l[l.c == 'missing'].text.unique())].screen_name.unique()) blamers = pd.DataFrame(blamers).transpose() blamers.columns = labels blamers.apply(pd.Series.nunique) #number of (unique) blamers from collections import defaultdict from itertools import permutations r = defaultdict(dict) for a1,a2 in permutations(range(1,10), 2): s1 = set(df[df.screen_name.isin(blamers[labels[a1]])].text.unique()) s2 = set(df[df.screen_name.isin(blamers[labels[a2]])].text.unique()) r[labels[a1]][labels[a2]] = len(s1 & s2) / len(s2) plt.figure(num=None, figsize=(12, 8), facecolor='w', edgecolor='w') z = pd.DataFrame.from_dict(r) #what percentage of blamers of x also blame y sns.heatmap(z,annot=True,vmin=0,vmax=1,cmap='RdBu_r',annot_kws={'size':12}); # In[199]: #import nltk #nltk.download('wordnet') from nltk.corpus import wordnet as wn from collections import defaultdict synonyms = defaultdict(set) words = 'blame fault responsible fail resign jail prison sentence accountable liable cause accuse treason poison' for w in words.split(): for synset in wn.synsets(w): synonyms[w].update([lemma.name() for lemma in synset.lemmas()]) from pprint import pprint pprint(dict(synonyms),indent=2,width=300) # In[208]: blame_words = {'responsible': 'account responsible blame accus[ie] \sliab \scause', 'fault': 'fault error mistake flaw', 'reason': 'ignor negl[ie] accident discriminat intention ideology decisi', 'sentenced': 'arrest convict jail jug bars prison sentence', 'betrayed': 'betray traitor treason', 'resign': 'resign quit remove.+office leave.+office step\sdown', 'poison': 'poison'} bw = {} for k,v in blame_words.items(): bw[k] = v.split() bw = pd.DataFrame.from_dict(blame_words, orient='index').rename(columns={0:'blame words per category'}) print(bw.to_latex()) # In[212]: blame_words = {'responsible': 'account responsible blame accus[ie] \sliab \scause', 'fault': 'fault error mistake flaw', 'reason': 'ignor negl[ie] accident discriminat intention ideology 
decision', 'sentenced': 'arrest convict jail jug bars prison sentence', 'betrayed': 'betray traitor treason', 'resign': 'resign quit remove.+office leave.+office step\sdown', 'poison': 'poison'}
blame_tw = {}  # unique text
blame_rt = {}  # RTs matter
total = set()
for k, v in blame_words.items():
    indices = set.union(*[set(df[df.text.str.contains(w, case=False)].index) for w in v.split()])
    blame_tw[k] = df.loc[indices,].text.nunique()
    blame_rt[k] = len(indices)
    total.update(indices)
blame_rt['total'] = len(total)
blame_tw['total'] = df.loc[total,].text.nunique()
pd.DataFrame([blame_rt, blame_tw], index=['tweets in the dataset (RTs count)', '# of tweets w/ unique text'])

# In[213]:

l = [v.split() for v in blame_words.values()]
blame_filter = '|'.join([item for sublist in l for item in sublist])
blames = df[df.text.str.contains(blame_filter, case=False)].copy()
blames = blames.replace({'\r': ' ', '\n': ' '}, regex=True)

# In[249]:

# group by tweet text
grouped = blames.groupby('text').size()
g = grouped.reset_index().rename(columns={0: 'RT'})
g = g.sort_values('RT', ascending=False)

# In[196]:

"""
sample = g.sample(n=2000,random_state=3).sort_values('RT',ascending=False).copy()
s1 = sample.sample(n=200,random_state=5)
s2 = sample.sample(n=200,random_state=7)
s3 = sample.sample(n=200,random_state=9)
s4 = sample.sample(n=200,random_state=11)
s5 = sample.sample(n=200,random_state=13)
s6 = sample.sample(n=200,random_state=15)
from itertools import combinations
for p,q in combinations(range(1,5),2):
    p = 's'+str(p)
    q = 's'+str(q)
    print('|'+p+'|','∩','|'+q+'|','=',len(set(eval(p).index)&set(eval(q).index)))
s1.to_csv('data/s1.csv',index=False)
s2.to_csv('data/s2.csv',index=False)
s3.to_csv('data/s3.csv',index=False)
s4.to_csv('data/s4.csv',index=False)
s5.to_csv('data/s5.csv',index=False)
s6.to_csv('data/s6.csv',index=False)
"""

# In[30]:

snyder = blames.text.str.contains('gov|nyder|onetoughnerd|bern', case=False)
em = blames.text.str.contains('mgr|manager|Darnell|Earley|Kurtz', case=False)
mayor = blames.text.str.contains('Dayne|Walling|ayor', case=False)
obama = blames.text.str.contains('obama|POTUS', case=False)
obama = obama & ~blames.text.str.contains('pledge|announc|nyder|governor', case=False)
epa = blames.text.str.contains('\sEPA\s', case=False)
republic = blames.text.str.contains('republic', case=False)
democrat = blames.text.str.contains('democrat', case=False)

# In[32]:

def perform(fun, *args):
    return fun(*args)

def meetmin(x, y):
    x = blames[x].screen_name
    y = blames[y].screen_name
    return 100 * len(set(x) & set(y)) / min(len(set(x)), len(set(y)))

# In[33]:

scores = []
s = snyder & ~(epa|obama|mayor|em)
for y in [s, epa, obama, mayor, em]:
    for f in [meetmin]:
        scores.append({'EM': perform(f, em, y),
                       'Mayor': perform(f, mayor, y),
                       'President': perform(f, obama, y),
                       'EPA': perform(f, epa, y),
                       'Snyder': perform(f, s, y)})
pd.DataFrame(scores, index=['Also blame Snyder','Also blame EPA','Also blame President','Also blame Mayor','Also blame EM'])

# ### Crowdsourcing
#
# We filtered our dataset using blame words and manually labeled a one percent sample of the tweets according to whom the blame is attributed:
#
# - 0: not a blame tweet
# - 1: MI Gov. Rick Snyder @onetoughnerd
# - 2: POTUS Obama
# - 3: Flint Mayor Dayne Walling
# - 4: EPA / Gina McCarthy
# - 5: Emergency Managers: Darnell|Earley|Kurtz
# - 6: Republicans (helped the crisis grow)
# - 7: Democrats (helped the crisis grow)
# - 8: Government
# - 9: Other specified (not among those listed above)
#
# Reviewers also identified whether they were confused or unsure about whom the tweet assigns blame to.
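# The raters' CSV files store these codes in column `c`, sometimes as a comma-separated pair
# (e.g. "6,7") and sometimes as a missing value. The sketch below shows how such a raw code maps
# onto the scheme above, mirroring the replacement rule used in the Figure 2 cell earlier; it is
# illustrative only and re-declares the `labels` list from that cell.

# In[ ]:

labels = ['No blame','MI Governor','POTUS','Flint Mayor','EPA','Emergency M.',
          'Republicans','Democrats','Government','Other indiv.','Unsure']

def first_label(code, unsure=10):
    # keep only the first code when a tweet received several; map a missing code to 10 ('Unsure')
    if code == 'missing':
        return unsure
    return int(str(code).split(',')[0])

print(first_label('6,7'), labels[first_label('6,7')])  # -> 6 Republicans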
# In[106]:

import pandas as pd
l = pd.DataFrame()
for i in range(5):
    r = pd.read_csv('data/training/Flint'+str(i+1)+'_train.csv')
    r['rater'] = i
    l = l.append(r)
l = l.fillna('missing')

# In[23]:

# Snippet for an inset axes; `parent_axes` must be an existing matplotlib Axes.
from mpl_toolkits.axes_grid.inset_locator import inset_axes
inset_axes = inset_axes(parent_axes,
                        width="30%",  # width = 30% of parent_bbox
                        height=1.,    # height: 1 inch
                        loc=3)

# In[117]:

df = pd.read_csv('data/us-city-populations.csv', usecols=['CityST','2010','LAT','LON'])
df2 = pd.read_csv('data/city_file.csv', dtype={'lat': str, 'lon': str})
df2['CityST'] = df2.city + ', ' + df2.state
merged = df.merge(df2, on='CityST', how='inner')
merged['latlon'] = merged[['lat','lon']].apply(tuple, axis=1)

# In[120]:

merged.head()

# In[260]:

# `sp` is a per-tweet sentiment score column on `blames` (not created in the cells shown here).
pos = {}
neg = {}
mean = {}
for g in ('s','epa','obama','mayor','em'):
    pos[g] = len(blames[(blames.sp>0) & eval(g)].text.unique())
    neg[g] = len(blames[(blames.sp<0) & eval(g)].text.unique())
    mean[g] = blames[(blames.sp<0) & eval(g)].sp.mean()
pd.DataFrame([pos, neg, mean], index=['pos tw unique','neg tw unique','mean'])

# In[30]:

with pd.option_context('display.max_colwidth', 114):
    print(blames[epa].text[:30].to_string(index=False))

# In[21]:

# `words` (the text to visualize) and `month` (1-12) are assumed to be defined beforehand.
from wordcloud import WordCloud, STOPWORDS
from scipy.misc import imread
from PIL import Image
import numpy as np
import calendar
#mask = imread('twitter_mask.png', flatten=True)
mask = np.array(Image.open("twitter_mask.png"))
wc = WordCloud(mask=mask, background_color='white', stopwords=STOPWORDS, width=2200, height=1400).generate(words)
plt.figure().suptitle(calendar.month_name[month]+', 2016')
plt.axis('off')
plt.imshow(wc)
plt.savefig('figs/wc_'+calendar.month_name[month]+'.png', dpi=300, bbox_inches='tight')
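# The word-cloud cell above expects `words` (one long string of text) and `month` (1-12), which are
# not defined in the cells shown here. The sketch below is one hypothetical way to prepare them from
# the tweet DataFrame saved in Flint.pkl (the name `tweets` and the choice of month are assumptions,
# not the authors' original code).

# In[ ]:

tweets = pd.read_pickle('../data/Flint.pkl')  # re-load the tweets, since `df` was overwritten above
month = 3                                     # e.g. March 2016
words = ' '.join(tweets[tweets.created_at.dt.month == month].text)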