#!/usr/bin/env python
# coding: utf-8

# ## Measuring GeoPopularity of US Governors using Social Media
#
# ## Research Questions
#
# 1. How can cyber popularities of the incumbent governors be measured?
# 2. How can physical popularities of incumbent governors be measured?
# 3. How can the relationship between incumbent governors' physical-world
#    and cyber-world popularities be calculated and visualized?
# 4. To what extent are US cities interested in governors' politics on
#    social media?

# In[11]:

import json
from collections import Counter

import folium
import numpy as np
import pandas as pd
from IPython.display import HTML

# Notebook export: the inline-backend magic is issued before pyplot is
# imported, exactly as in the original cell order.
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

plt.style.use('ggplot')


def _choropleth_map(data, column, mapname, threshold_scale, fill_color,
                    legend_name, geo_path):
    """Build a state-level choropleth of ``data[column]`` and write it to disk.

    Writes ``<mapname>.json`` (the data binding) and ``<mapname>.html`` (the
    map) into the current directory and returns the folium map object.

    NOTE(review): ``geo_json`` / ``create_map`` are the legacy folium API that
    this notebook was written against -- confirm the installed folium version
    still provides them before upgrading.
    """
    fmap = folium.Map(location=[40, -99], zoom_start=4)
    fmap.geo_json(geo_path=geo_path, data=data,
                  data_out=mapname + '.json',
                  columns=['state', column],
                  threshold_scale=threshold_scale,
                  key_on='feature.properties.name',
                  fill_color=fill_color, fill_opacity=0.7, line_opacity=0.2,
                  legend_name=legend_name)
    fmap.create_map(path=mapname + '.html')
    return fmap


def _parse_latlon(key):
    """Split a ``'lat,lon'`` string key into a ``(lat, lon)`` pair of floats."""
    lat, lon = key.split(',')
    return float(lat), float(lon)


def _load_json(path):
    """Read and parse a JSON file, closing the handle when done."""
    with open(path) as handle:
        return json.load(handle)


def _circle_map(counts, path, radius_scale=1):
    """Write a map with one circle per ``'lat,lon'`` key, sized by its count."""
    fmap = folium.Map(location=[40, -99], zoom_start=4, tiles='Mapbox Bright')
    for key, count in counts.items():
        # Pass numeric coordinates rather than the raw split strings.
        fmap.circle_marker(location=list(_parse_latlon(key)),
                           radius=count * radius_scale)
    fmap.create_map(path=path)
    return fmap


# In[12]:

# read in election and twitter datasets in tabular format
df = pd.read_csv('../data/governors-challengers.csv')
ch = pd.read_table('../data/tw_ch.csv',
                   usecols=['screen_name', 'followers_count'],
                   encoding='utf-16')
gov = pd.read_table('../data/tw_gov.csv',
                    usecols=['screen_name', 'followers'])

# rename column names for clarification and seamless merging
ch = ch.rename(columns={'screen_name': 'twch', 'followers_count': 'folch'})
gov = gov.rename(columns={'screen_name': 'twgov', 'followers': 'folgov'})


# In[24]:

# map of Incumbent Governors Vote Shares (%); uses the raw ``shareGov``
# column, i.e. before it is re-normalized further down.
mapname = 'gov_share'
state_geo = 'us_states.geojson'

states = _choropleth_map(df, 'shareGov', mapname,
                         threshold_scale=[45, 50, 55, 60, 65, 70],
                         fill_color='PuRd',
                         legend_name='Incumbent Governor Vote Share (%)',
                         geo_path=state_geo)
HTML('')


# In[16]:

# let's merge on screen names (pandas merges on the columns the frames share;
# presumably ``twgov`` / ``twch`` -- verify against the election CSV header)
df = df.merge(gov)
df = df.merge(ch)

# governors' twitter followers share (only considering the primary challenger)
df['twshare'] = 100 * df.folgov / (df.folgov + df.folch)

# similarly, restrict shareGov to the governor-vs-primary-challenger contest
df['shareGov'] = 100 * df.shareGov / (df.shareGov + df.shareCh)

# get the vote share - twitter share diff
df['shareDiff'] = (df['twshare'] - df['shareGov']).abs()
df


# In[23]:

states = _choropleth_map(
    df, 'twshare', 'govtw_share',
    threshold_scale=[50, 60, 70, 80, 90, 99],
    fill_color='PuRd',
    legend_name='Incumbent Governors Twitter Follower Share (%)',
    geo_path=state_geo)
HTML('')


# In[71]:

mapname = 'share_diff'
states = _choropleth_map(
    df, 'shareDiff', mapname,
    threshold_scale=[5, 10, 15, 20, 30, 40],
    fill_color='YlGnBu',
    legend_name='The Difference Between Governors Vote Share & Twitter Follower Share (%)',
    geo_path=state_geo)
HTML('')


# In[7]:

# Index by state so the x axis carries the state names.
ax = df.set_index('state')[['twshare', 'shareGov']].plot(
    xticks=range(len(df)), rot=75, figsize=(15, 3))
ax.legend(['Twitter share', 'Vote share'], loc='best')


# In[5]:

# correlation between twitter share of governors
# and their 'normalized' vote share
df[['twshare', 'shareGov']].corr()


# In[28]:

ax = df.plot(x='twshare', y='shareGov', kind='scatter', figsize=(15, 15),
             xlim=(0, 100), ylim=(50, 80))
for state, tw_share, vote_share in zip(df['state'], df['twshare'],
                                       df['shareGov']):
    ax.annotate(state, (tw_share, vote_share), xytext=(-40, 7),
                textcoords='offset points', fontsize=14)

# x is the Twitter share and y is the vote share; the labels were previously
# attached to the wrong axes.
ax.set_xlabel("Normalized Twitter Follower Share of Sitting Governor",
              fontsize=14)
ax.set_ylabel("Normalized Vote Share of Sitting Governor", fontsize=14)
ax.set_title('Twitter Follower Share vs Vote Share of Sitting US Governors',
             fontsize=18)
# .iloc replaces the removed .ix indexer; [0, 1] is the off-diagonal entry.
ax.text(0.01, 0.99,
        'Correlation between Twitter share and vote share : ' +
        '{:2.2f}'.format(df[['twshare', 'shareGov']].corr().iloc[0, 1]),
        horizontalalignment='left', verticalalignment='top',
        transform=ax.transAxes, fontsize=12)


# In[9]:

# perform clustering and plot the dendrogram
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import pdist, squareform

# compute distance matrix
condensed = pdist(df[['twshare', 'shareGov']], metric='euclidean')
distxy = squareform(condensed)  # square form, kept for inspection

ax = plt.subplot(111)
ax.figure.set_size_inches(5, 10)
# linkage() treats a 2-D array as raw observations, so it must be given the
# condensed (1-D) distances rather than the square matrix.
l = linkage(condensed, method='complete')
dendrogram(l, labels=df['state'].tolist(), orientation='right')


# In[10]:

# states whose incumbent governors have fewer followers than their challengers...
df[df['folch'] > df['folgov']]


# In[84]:

f = _load_json('../data/geocoded.json')

c2 = Counter()  # per location: number of entries in ``f`` whose 'flocs' include it
c = Counter()   # per location: sum of the 'flocs' values across all entries
for state in f:
    c2.update(f[state]['flocs'].keys())
    c.update(f[state]['flocs'])

c2.most_common(10)


# In[112]:

fig = plt.figure(figsize=(16, 8))
ax = plt.gca()
ax.scatter(range(len(c2)), sorted(c2.values(), reverse=True))
ax.set_xlim(-100, len(c2) + 100)
ax.set_ylim(0, c2.most_common(1)[0][1] + 1)
ax.set_xlabel('Cities (ranked)')
ax.set_ylabel('Unique governors followed')
ax.set_title('Number of Unique Governors Followed by US cities')


# In[77]:

plt.loglog(range(len(c2)), sorted(c2.values(), reverse=True))


# In[122]:

import powerlaw

fit = powerlaw.Fit(list(c.values()))
print(fit.alpha, fit.sigma)
# Optional follow-up: fit.distribution_compare('power_law', 'lognormal')


# In[113]:

fig = plt.figure(figsize=(16, 8))
ax = plt.gca()
ax.scatter(range(len(c)), sorted(c.values(), reverse=True))
ax.set_yscale('log')
ax.set_xlim(-100, len(c) + 100)
ax.set_ylim(0.9, c.most_common(1)[0][1] + 5000)
ax.set_xlabel('Cities (ranked)')
ax.set_ylabel('Residents following at least one governor')
ax.set_title('Total Number of Residents Following at least one Governor')


# In[115]:

# http://code.xster.net/pygeocoder/wiki/Home
from pygeocoder import Geocoder

# Iterate the ranking directly: computed once, and safe when fewer than ten
# locations are present.
for key, n_govs in c2.most_common(10):
    lat, lon = _parse_latlon(key)
    results = Geocoder.reverse_geocode(lat, lon)
    print(results, '\t # of govs followed:', n_govs)


# In[13]:

states = _circle_map(c2, 'unique_govs.html', radius_scale=10)
HTML('')


# In[14]:

# Rebuild the follower totals from disk, as the original notebook cell did.
f = _load_json('../data/geocoded.json')
c = Counter()
for state in f:
    c.update(f[state]['flocs'])


# In[15]:

# http://code.xster.net/pygeocoder/wiki/Home
for key, n_followers in c.most_common(10):
    lat, lon = _parse_latlon(key)
    results = Geocoder.reverse_geocode(lat, lon)
    print(results, '\t # of followers:', n_followers)


# In[16]:

states = _circle_map(c, 'all_followers.html')
HTML('')


# In[17]:

from mpl_toolkits.basemap import Basemap
from matplotlib.path import Path

# Mercator Projection
# http://matplotlib.org/basemap/users/merc.html
m = Basemap(projection='merc',
            llcrnrlat=-80, urcrnrlat=80,
            llcrnrlon=-180, urcrnrlon=180,
            lat_ts=20, resolution='c')

# Poly vertices, as [lat, lon]
p = [[25.774252, -80.190262],
     [18.466465, -66.118292],
     [32.321384, -64.75737]]

# Projected vertices (the Basemap instance is called with lon first, then lat)
p_projected = [m(x[1], x[0]) for x in p]

# Create the Path
p_path = Path(p_projected)

# Test points, as [lat, lon]
p1 = [27.254629577800088, -76.728515625]
p2 = [27.254629577800088, -74.928515625]

# Test point projection
p1_projected = m(p1[1], p1[0])
p2_projected = m(p2[1], p2[0])

print(p_path.contains_point(p1_projected))
print(p_path.contains_point(p2_projected))