#!/usr/bin/env python
# coding: utf-8

# # Women's March and Tea Party, by the Numbers
#
# The Tea Party protests that took the country by storm in 2009 had an outsized impact on the legislative process. The recent Women's March and associated movement could potentially have a similar effect, so I was curious to see how the two compared in size and location. Below, I look at the distribution and size of the marches, compare turnout by city, then look at the portion of each state that attended protests.
#
# Overall, there were more than ten times as many Women's Marchers (**4,157,678**) as Tea Party marchers (**310,960**). Interestingly, both protests had a similar median number of marchers (**322** vs **450**), although the mean was substantially higher for the Women's March (**6673** vs. **903**). Finally, almost every state had a larger percentage of the population turn out for the Women's March, with Colorado leading the way at **2.9%**. This means that although the march was more concentrated in cities, it was still a grassroots event distributed geographically throughout the 50 states.
#
# If the energy of the Women's March can be harnessed, it could have an even larger impact than the Tea Party. We may already be seeing the results in Congress and town halls.
#
#
# If you're viewing this notebook on GitHub, view it in NBViewer [here](http://nbviewer.jupyter.org/gist/psthomas/79b61a107205a90b3660bb4649fb2672) instead to see the interactive plots and tables.
# In[97]:

get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import statsmodels.formula.api as smf
import statsmodels.api as sm
import json
from IPython.display import HTML
matplotlib.style.use('ggplot')

# # Import the Data
#
# Jeremy Pressman, Erica Chenoweth and others recently finished compiling all the Women's March [data](https://docs.google.com/spreadsheets/d/1xa0iLqYKz8x9Yc_rfhtmSOJQ2EGgeUVjvV4A8LsIaxY/htmlview?sle=true#gid=0) and 538 compiled [data](https://fivethirtyeight.com/features/tea-parties-appear-to-draw-at-least/) on the Tea Party protests a few years ago, so I'll be using both those sources. I got the state level population [data](https://factfinder.census.gov/faces/tableservices/jsf/pages/productview.xhtml?pid=PEP_2015_PEPANNRES&src=pt) from the US Census, and the voter turnout [data](https://docs.google.com/spreadsheets/d/133Eb4qQmOxNvtesw2hdVns073R68EZx4SfCnP4IGQf8/htmlview?sle=true#gid=19) from David Wasserman. All these sources are available in a zipped file [here](https://www.dropbox.com/s/4f2tccm0urnt97f/march_data.zip?dl=1).
#

# In[122]:

# Tea Party data: a tab-separated file whose 'number' column is renamed to
# 'tea_num'. Rows sharing the same (city, state) are then summed, so a city
# with two reported protests ends up as a single row.
tea_df = (
    pd.read_csv('data/tea_party.csv', sep='\t', encoding='utf-8', index_col=False)
    .rename(columns={'number': 'tea_num'})
    .groupby(by=['city', 'state'], as_index=False)
    .sum()
)

# In[123]:

#Read in Women's March data.
def _to_numeric_if_possible(column):
    """Return *column* parsed by ``pd.to_numeric``, or unchanged if parsing fails.

    Replaces ``pd.to_numeric(..., errors='ignore')``: the same "leave the
    column alone when it is not numeric" result, obtained by catching the
    exception explicitly.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


march_df = pd.read_csv('data/womens_march.csv', encoding='utf-8', index_col=False)
# Keep only the text before the first comma of 'Location'.
march_df['Location'] = march_df['Location'].str.split(',', expand=True)[0]
# Remove every remaining comma from string cells before the numeric conversion.
march_df = march_df.replace({',': ''}, regex=True)
march_df = march_df.apply(_to_numeric_if_possible)
march_df = march_df.loc[:, ['Location', 'State/Territory', 'Best Guess']]
march_df = march_df.rename(columns={'Location': 'city',
                                    'State/Territory': 'state',
                                    'Best Guess': 'march_num'})
#Sum any cities with two protests
march_df = march_df.groupby(by=['city', 'state'], as_index=False).sum()
march_df['city'] = march_df['city'].replace({'Washington DC': 'Washington'})

# In[124]:

# Import and parse the state population data
states = {
    'Mississippi': 'MS', 'Northern Mariana Islands': 'MP', 'Oklahoma': 'OK',
    'Wyoming': 'WY', 'Minnesota': 'MN', 'Alaska': 'AK', 'American Samoa': 'AS',
    'Arkansas': 'AR', 'New Mexico': 'NM', 'Indiana': 'IN', 'Maryland': 'MD',
    'Louisiana': 'LA', 'Texas': 'TX', 'Tennessee': 'TN', 'Iowa': 'IA',
    'Wisconsin': 'WI', 'Arizona': 'AZ', 'Michigan': 'MI', 'Kansas': 'KS',
    'Utah': 'UT', 'Virginia': 'VA', 'Oregon': 'OR', 'Connecticut': 'CT',
    'District of Columbia': 'DC', 'New Hampshire': 'NH', 'Idaho': 'ID',
    'West Virginia': 'WV', 'South Carolina': 'SC', 'California': 'CA',
    'Massachusetts': 'MA', 'Vermont': 'VT', 'Georgia': 'GA',
    'North Dakota': 'ND', 'Pennsylvania': 'PA', 'Puerto Rico': 'PR',
    'Florida': 'FL', 'Hawaii': 'HI', 'Kentucky': 'KY', 'Rhode Island': 'RI',
    'Nebraska': 'NE', 'Missouri': 'MO', 'Ohio': 'OH', 'Alabama': 'AL',
    'Illinois': 'IL', 'Virgin Islands': 'VI', 'South Dakota': 'SD',
    'Colorado': 'CO', 'New Jersey': 'NJ', 'National': 'NA', 'Washington': 'WA',
    'North Carolina': 'NC', 'Maine': 'ME', 'New York': 'NY', 'Montana': 'MT',
    'Nevada': 'NV', 'Delaware': 'DE', 'Guam': 'GU',
}


def convert_state(element):
    """Map a full state/territory name to its two-letter code.

    Returns the abbreviation stored in ``states`` for *element*, or
    ``np.nan`` when *element* is not a key of that dict.
    """
    return states.get(element, np.nan)


pop_df = pd.read_csv('data/state_population.csv', encoding='utf-8', index_col=False)
# 53, Includes u'Puerto Rico' and u'District of Columbia', u'United States'
pop_df['state'] = pop_df['geography'].apply(convert_state)
pop_df = pop_df[['state', 'pop2016']]

# In[125]:

#Import and parse the voting data, by state
vote_df = pd.read_csv('data/national_vote.csv', encoding='utf-8', index_col=False)
# Strip '%', '*' and ',' from string cells so the columns can be parsed as
# numbers. The asterisk pattern is a raw string: '\*' in a plain literal is an
# invalid escape sequence.
vote_df = vote_df.replace({'%': '', r'\*': '', ',': ''}, regex=True)
vote_df = vote_df.apply(_to_numeric_if_possible)
vote_df['state'] = vote_df['State'].apply(convert_state)
vote_df = vote_df[['state', "Dem '16 Margin"]]
vote_df = vote_df.rename(columns={"Dem '16 Margin": 'margin2016'})
# len(list(vote_df['state'])) -> 51, includes DC

# # Marchers by City
#
# First, I look at this data by city. The boxplot shows that the median march size was actually very similar between the two movements (**322** vs **450**). The mean, however, was an order of magnitude higher for the Women's March (**6673**), and there are more outliers at the high end of the march size. There were also ten times more Women's Marchers (**4,157,678**) than Tea Party marchers (**310,960**).

# In[126]:

#Merge dataframes on city, state
city_df = march_df.merge(tea_df, how='outer', on=['city', 'state'])
#Copy for distributions (taken before the fill so missing values stay NaN)
unfcity_df = city_df.copy()
# Fill 0, assume cities without data had no marchers.
# Note, it's possible the 538 data is less complete than Women's March.
city_df.fillna(value=0, inplace=True)

#Boxplot
fig, ax = plt.subplots()
ax.set_yscale('symlog')
ax.set_ylim(1, 1e6)
unfcity_df.plot.box(figsize=(10, 7), ax=ax, meanline=True, showmeans=True,
                    color='gray', sym='k.')
plt.ylabel("Protesters (log)")
plt.show()

#Print total marchers
print("Total Women's March: " + '{:,.0f}'.format(city_df['march_num'].sum()))
print("Total Tea Party: " + '{:,.0f}'.format(city_df['tea_num'].sum()))
unfcity_df.describe()

# ## Cities Compared
#
# Below is an interactive scatter plot of the number of protesters in each city for each movement.
# This is created using an outer join, so the assumption is that any city not shared between the two lists had no marchers.
#
# Any city above the 45 degree line had more Tea Party Marchers, and those below had more Women's Marchers. These are log axes, so the cities do skew substantially towards the Women's March (especially the large ones).

# In[127]:

# NOTE: interactive_log_scatter and interactive_table are defined in the
# "Code for Visualizations" section at the end of this file; those cells must
# be run before this one.

# JavaScript passed to the plot as settings["extrajs"]: appends a line from
# (1, 1) to (1e8, 1e8) in data coordinates. One call per line, so the "//"
# comments end at the line break instead of swallowing the rest of the chain.
extrajs = '''
svg.append("line")
    .attr("x1", x(1))
    .attr("y1", y(1))
    .attr("x2", x(1e8))
    .attr("y2", y(1e8))
    .attr("stroke-width", 1.25)
    .attr("stroke", "#888")  //#999 #fff
    .attr("opacity", "0.6")
    //.attr("fill", "none")
    //.style("stroke-dasharray", ("10, 10"))
    .attr("class", "trendline")
    .attr("clip-path", "url(#clip)");
'''

# JavaScript expression passed as settings["tooltip"]. The fields are separated
# by "<br>": a raw line break inside a double-quoted JS string is a syntax error.
tooltip = (
    ''' "Location: " + d[keys.city] + ", " + d[keys.state] + '''
    '''"<br>Women's March: " + fmtTh(+d[keys.march_num]) + '''
    '''"<br>Tea Party: " + fmtTh(+d[keys.tea_num]) '''
)

settings = {
    "x_label": "Women's March (log)",
    "y_label": "Tea Party (log)",
    "x": 'march_num',
    "y": 'tea_num',
    "tooltip": tooltip,
    "extrajs": extrajs,
}
interactive_log_scatter(city_df, settings=settings)

# In[128]:

interactive_table(city_df.sort_values(by='march_num', ascending=False),
                  width=400, height=500)

# ## Binned and Counted
#
# This makes it clear that the majority of both protests took place in groups of 200,000 or less, and that the Women's March dwarfed the Tea Party marches.

# In[129]:

# Bin by march size and sum:
fig, ax = plt.subplots()
set_bins = [-1, 2e5, 4e5, 6e5, 8e5]
# observed=False keeps empty bins, so the four tick labels below always line
# up with four bars.
groups = city_df.groupby(pd.cut(city_df['march_num'], set_bins), observed=False)
# numeric_only=True: sum the marcher counts only. Without it, the string
# columns 'city' and 'state' are "summed" by concatenation.
groups_df = groups.sum(numeric_only=True)
groups_df.plot.bar(figsize=(10, 7), color=['steelblue', 'red'], alpha=0.6, ax=ax)
plt.ylabel("Cumulative Marchers")
plt.xlabel("March Size")
plt.xticks(rotation=90)
# Labels are the upper edge of each bin in set_bins.
ax.set_xticklabels(['200k', '400k', '600k', '800k'])
plt.show()

# # Marchers by State
#
# Next, I look at the marchers grouped by state. Every state except West Virginia had a larger percentage participate in the Women's March, with Colorado leading with **2.9%** of its population. California had the largest total number of protesters, at **910,830**.

# In[130]:

# Group city_df by state, sum (numeric columns only, so the 'city' strings are
# not concatenated into the result).
state_df = city_df.groupby(by='state', as_index=False).sum(numeric_only=True)

# Merge with vote and population dataframes; 'state' is the shared key.
state_df = state_df.merge(vote_df, how='inner', on='state')
state_df = state_df.merge(pop_df, how='inner', on='state')

# Marchers as a percentage of the state's pop2016 value.
state_df['tea_pct'] = (state_df['tea_num'] / state_df['pop2016']) * 100
state_df['march_pct'] = (state_df['march_num'] / state_df['pop2016']) * 100

# Leave DC out, marchers exceed population
state_df = state_df[state_df['state'] != 'DC']
state_df = state_df.sort_values(by='march_pct', ascending=False).reset_index(drop=True)

interactive_table(state_df, width=600, height=500)

# ## How do state turnouts compare?
# In[131]:

# Women's March share (x) against Tea Party share (y), one label per state.
fig, ax = plt.subplots(figsize=(10, 8))
# The marker is empty, so each point is represented only by the state label
# annotated below.
ax.scatter(x=state_df['march_pct'], y=state_df['tea_pct'], marker='',
           alpha=0.7, color="steelblue", label='_nolegend_')  # marker='o'

state_rows = zip(state_df['march_pct'], state_df['tea_pct'], state_df['state'])
for march_share, tea_share, abbrev in state_rows:
    ax.annotate('%s' % abbrev, xy=(march_share, tea_share), textcoords='data')

ax.set_xlabel("Women's Marchers (% Population)")
ax.set_ylabel("Tea Party Marchers (% Population)")

# Reference lines over the plotted x-range.
guide = pd.DataFrame({'line': np.linspace(0, 3, 10)})
ax.plot(guide, guide, 'k--', alpha=0.7, label='Equal (1:1)')
# Average State Ratio = 0.008984/0.001222 = 7.35 times % of women's marchers
ax.plot(guide, guide / 7, '--', color="gray", alpha=0.8,
        label='Average State Ratio (7:1)')

ax.set_xlim(0, 3.0)
ax.set_ylim(0, 0.5)
ax.legend()
plt.show()

# ## Did blue states have more marchers?
#
# The Democratic margin is a fairly good indicator for the Women's March participation. Some states overperformed (CA, OR, MA, VT, WA, IL) or underperformed the linear regression line, although some of the underperformers are states adjacent to DC.
# In[132]:

# Women's March share of population against the 2016 Democratic margin, with
# an ordinary-least-squares line fitted on the state rows.
fig, ax = plt.subplots(figsize=(10, 8))  # figsize=(12,10)
# The marker is empty, so each point is represented only by its state label.
ax.scatter(x=state_df['margin2016'], y=state_df['march_pct'], marker='',
           alpha=0.9, color="steelblue", label='_nolegend_')

state_rows = zip(state_df['margin2016'], state_df['march_pct'], state_df['state'])
for margin, march_share, abbrev in state_rows:
    ax.annotate('%s' % abbrev, xy=(margin, march_share), textcoords='data')

ax.set_xlabel("2016 Democratic Margin")
ax.set_ylabel("Women's Marchers (% Population)")

# 1st order polynomial
margin_fit = smf.ols(formula='I(march_pct) ~ 1 + margin2016',
                     data=state_df).fit()  # , missing='drop'
# Evaluate the fitted line on an evenly spaced grid spanning the x-limits.
margin_grid = pd.DataFrame({'margin2016': np.linspace(-60, 40, 10)})
ax.plot(margin_grid, margin_fit.predict(margin_grid), color="black",
        label='Poly n=1 $R^2$=%.2f' % (margin_fit.rsquared), alpha=0.6)

ax.set_xlim(-60, 40)
ax.set_ylim(-0.5, 3.1)
ax.legend()
plt.show()

# http://www.politico.com/story/2016/11/senate-democrats-2018-midterms-231516
# Republicans are targeting a quintet of senators from conservative states where Trump
# walloped Hillary Clinton: Montana, Missouri, Indiana, North Dakota and West Virginia.
# The GOP could amass a filibuster-proof majority by running the table in those
# states and other battlegrounds.

# IN, MS underperformed
# MT overperformed trend line,
# ND, WV did about as expected.
#MD, NJ, RI, CT, VA, DE might all have underperformed because people were in DC
# DC has more than its population in protest 1.00

# In[133]:

# Tea Party share of population against the 2016 Democratic margin, with an
# ordinary-least-squares line fitted on the state rows.
fig, ax = plt.subplots(figsize=(10, 8))
# The marker is empty, so each point is represented only by its state label.
plt.scatter(x=state_df['margin2016'], y=state_df['tea_pct'], marker='',
            color='gray', alpha=0.9, label='_nolegend_')

for margin, tea_share, abbrev in zip(state_df['margin2016'],
                                     state_df['tea_pct'],
                                     state_df['state']):
    ax.annotate('%s' % abbrev, xy=(margin, tea_share), textcoords='data')

plt.xlabel("2016 Democratic Margin")
plt.ylabel("Tea Party Marchers (% Population)")

# 1st order polynomial
poly_1 = smf.ols(formula='I(tea_pct) ~ 1 + margin2016', data=state_df).fit()  # , missing='drop'
x = pd.DataFrame({'margin2016': np.linspace(-60, 40, 10)})
plt.plot(x, poly_1.predict(x), 'k-',
         label='Poly n=1 $R^2$=%.2f' % (poly_1.rsquared), alpha=0.6)

ax.set_xlim(-60, 40)
ax.set_ylim(-0.1, 0.5)
plt.legend()
plt.show()

# # Code for Visualizations

# In[89]:

def interactive_log_scatter(df, settings, width=960, height=600):
    """Return an IPython ``HTML`` object embedding a scatter plot of *df*.

    The template's ``||datainsert||`` placeholder is replaced with
    ``df.to_json(orient="values")``, ``||keys||`` with a JSON object mapping
    each column name to its position, and ``||<name>||`` with
    ``str(settings[name])`` for every key in *settings*.

    Args:
        df: DataFrame whose rows are plotted.
        settings: dict of placeholder name -> replacement value.
        width: iframe width in pixels (defaults to the former fixed 960).
        height: iframe height in pixels (defaults to the former fixed 600).
    """
    # NOTE(review): the HTML/JS template body appears to be missing from this
    # copy of the file -- none of the placeholders replaced below occur in the
    # literal. Restore it from the original notebook.
    srcdoc = r''' Zoom + Pan
'''
    srcdoc = srcdoc.replace('||datainsert||', df.to_json(orient="values"))

    key_list = list(df)
    key_dict = {i: key_list.index(i) for i in key_list}
    srcdoc = srcdoc.replace('||keys||', json.dumps(key_dict))

    for name, value in settings.items():
        srcdoc = srcdoc.replace('||{0}||'.format(name), str(value))

    # The document is placed inside a double-quoted attribute, so every double
    # quote in it has to be written as an entity.
    srcdoc = srcdoc.replace('"', '&quot;')
    # NOTE(review): the wrapper markup was an empty string here; this iframe is
    # a reconstruction -- confirm the attributes against the original notebook.
    embed = HTML(
        '<iframe srcdoc="{0}" style="width: {1}px; height: {2}px; '
        'display: block; border: none;"></iframe>'.format(srcdoc, width, height)
    )
    return embed

# In[2]:

def interactive_table(df, width, height):
    """Return an IPython ``HTML`` object embedding *df* as a table.

    The template's ``||headinginsert||`` placeholder is replaced with the JSON
    list of column names and ``||datainsert||`` with
    ``df.to_json(orient="values")``.

    Args:
        df: DataFrame to display.
        width: iframe width in pixels.
        height: iframe height in pixels.
    """
    # NOTE(review): the HTML/JS template body appears to be missing from this
    # copy of the file -- neither placeholder replaced below occurs in the
    # literal. Restore it from the original notebook.
    srcdoc = r'''
'''
    srcdoc = srcdoc.replace('||headinginsert||', json.dumps(list(df)))
    srcdoc = srcdoc.replace('||datainsert||', df.to_json(orient="values"))

    # The document is placed inside a double-quoted attribute, so every double
    # quote in it has to be written as an entity.
    srcdoc = srcdoc.replace('"', '&quot;')
    # NOTE(review): the wrapper markup was blank here; this iframe is a
    # reconstruction -- confirm the attributes against the original notebook.
    # Alternative styling noted by the author: width: 100%; margin: 25px auto;
    html = (
        '<iframe srcdoc="{0}" style="width: {1}px; height: {2}px; '
        'display: block; border: none;"></iframe>'
    ).format(srcdoc, width, height)
    embed = HTML(html)
    return embed