#!/usr/bin/env python # coding: utf-8 # In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import numpy as np import pandas as pd import random import matplotlib.pyplot as plt plt.style.use('ggplot') from collections import Counter # ### Converting state polling average to probability # The state polling average is the average from multiple polling companies using different margins of error. For example, if a candidate has a polling average of 50% with an assumed margin of error of +/- 4, that means the truth is in the range of 46 to 54. Their opponent has a polling average of 46% with an assumed margin of error of +/- 4, that means the truth is in the range of 42 to 50. # ![example7](https://raw.githubusercontent.com/ahoaglandnu/election/master/images/ex7.png) # **This means we can have one candidate leading in the polls but ultimately lose the election. # ![example8](https://raw.githubusercontent.com/ahoaglandnu/election/master/images/ex8.png) # It also means we can have the candidate that lead in the polls win larger than expected.** # ![example9](https://raw.githubusercontent.com/ahoaglandnu/election/master/images/ex9.png) # Polling average alone does not give us the probability of a candidate winning. The candidate in the lead would win 100% of the time. # In[2]: x = 50 y = 46 wins = 0 # number of wins x over y number_of_elections = 10000 for i in range(number_of_elections): victory = x - y if victory >= 1: wins += 1 wins_percentage = str((float(wins) / number_of_elections)*100)+'%' print('Candidate x wins:', wins_percentage,'of', number_of_elections,'elections') # Instead, we will randomly add the margin of error to a candidate's polling average. # In[3]: n = 4 random.choice(range((n*-1),(n+1),1)) # In[4]: x = 50 y = 46 n = 4 wins = 0 number_of_elections = 10000 for i in range(number_of_elections): moe_x = random.choice(range((n*-1),(n+1),1)) # random selection of margin or error for candidate x moe_y = random.choice(range((n*-1),(n+1),1)) # different margin of error for candidate y x1 = x + moe_x y1 = y + moe_y victory = x1 - y1 if victory >= 1: wins += 1 wins_percentage = str((float(wins) / number_of_elections)*100)+'%' print('Candidate x wins:', wins_percentage,'of', number_of_elections,'elections') # Note that a candidate with a 4 point lead only wins 81% of the time. Keep in mind that a tie is not a victory. # What if we run it again? Will the percentage stay the same? # In[5]: x = 50 y = 46 n = 4 wins = 0 # number of wins x over y number_of_elections = 10000 for i in range(number_of_elections): moe_x = random.choice(range((n*-1),(n+1),1)) # random selection of margin or error for candidate x moe_y = random.choice(range((n*-1),(n+1),1)) # different margin of error for candidate y x1 = x + moe_x y1 = y + moe_y victory = x1 - y1 if victory >= 1: # cannot be a tie wins += 1 wins_percentage = str((float(wins) / number_of_elections)*100)+'%' print('Candidate x wins:', wins_percentage,'of', number_of_elections,'elections') # It is still about 81%. Let's try doubling the number of simulated elections. # In[6]: x = 50 y = 46 n = 4 wins = 0 number_of_elections = 20000 # previously we used 10,000 for i in range(number_of_elections): moe_x = random.choice(range((n*-1),(n+1),1)) moe_y = random.choice(range((n*-1),(n+1),1)) x1 = x + moe_x y1 = y + moe_y victory = x1 - y1 if victory >= 1: wins += 1 wins_percentage = str((float(wins) / number_of_elections)*100)+'%' print('Candidate x wins:', wins_percentage,'of', number_of_elections,'elections') # In[7]: x = 50 y = 46 n = 4 wins = 0 number_of_elections = 20000 # previously we used 10,000 for i in range(number_of_elections): moe_x = random.choice(range((n*-1),(n+1),1)) moe_y = random.choice(range((n*-1),(n+1),1)) x1 = x + moe_x y1 = y + moe_y victory = x1 - y1 if victory >= 1: wins += 1 wins_percentage = str((float(wins) / number_of_elections)*100)+'%' print('Candidate x wins:', wins_percentage,'of', number_of_elections,'elections') # What about 100,000? # In[8]: x = 50 y = 46 n = 4 wins = 0 number_of_elections = 100000 # previously we used 20,000 for i in range(number_of_elections): moe_x = random.choice(range((n*-1),(n+1),1)) moe_y = random.choice(range((n*-1),(n+1),1)) x1 = x + moe_x y1 = y + moe_y victory = x1 - y1 if victory >= 1: wins += 1 wins_percentage = str((float(wins) / number_of_elections)*100)+'%' print('Candidate x wins:', wins_percentage,'of', number_of_elections,'elections') # In[9]: x = 50 y = 46 n = 4 wins = 0 number_of_elections = 100000 # previously we used 20,000 for i in range(number_of_elections): moe_x = random.choice(range((n*-1),(n+1),1)) moe_y = random.choice(range((n*-1),(n+1),1)) x1 = x + moe_x y1 = y + moe_y victory = x1 - y1 if victory >= 1: wins += 1 wins_percentage = str((float(wins) / number_of_elections)*100)+'%' print('Candidate x wins:', wins_percentage,'of', number_of_elections,'elections') # 1,000,000? # In[10]: x = 50 y = 46 n = 4 wins = 0 number_of_elections = 1000000 # previously we used 100,000 for i in range(number_of_elections): moe_x = random.choice(range((n*-1),(n+1),1)) moe_y = random.choice(range((n*-1),(n+1),1)) x1 = x + moe_x y1 = y + moe_y victory = x1 - y1 if victory >= 1: wins += 1 wins_percentage = str((float(wins) / number_of_elections)*100)+'%' print('Candidate x wins:', wins_percentage,'of', number_of_elections,'elections') # In[11]: x = 50 y = 46 n = 4 wins = 0 number_of_elections = 1000000 # previously we used 100,000 for i in range(number_of_elections): moe_x = random.choice(range((n*-1),(n+1),1)) moe_y = random.choice(range((n*-1),(n+1),1)) x1 = x + moe_x y1 = y + moe_y victory = x1 - y1 if victory >= 1: wins += 1 wins_percentage = str((float(wins) / number_of_elections)*100)+'%' print('Candidate x wins:', wins_percentage,'of', number_of_elections,'elections') # We have the probability stabilizing, but do we need to do Monte Carlo simulations or can we calculate the probability to get an accurate number? Let's look at the range again. # ![example7](https://raw.githubusercontent.com/ahoaglandnu/election/master/images/ex7.png) # Candidate x can end up winning 46, 47, 48, 49, 50, 51, 52, 53, or 54 votes. # Candidate y can end up winning 42, 43, 44, 45, 46, 47, 48, 49, or 50 votes. # A margin of error of 4 means both candidates have a 1 out of 9 probability (4 below average, average, 4 above). # # Candidate y can only win with 47, 48, 49, or 50 (a 4 out of 9 probability) **but** it is dependent on candidate x. # # For candidate y to win with a 47 (1 out of 9 probability), candidate y must get 46 (1 out of 9 probability). # For candidate y to win with a 48 (2 out of 9 probability), candidate y must get 47 or less (2 out of 9 probability). # For candidate y to win with a 49 (3 out of 9 probability), candidate y must get 48 or less (3 out of 9 probability). # For candidate y to win with a 50 (4 out of 9 probability), candidate y must get 49 or less (4 out of 9 probability). # We can do the complicated combinatorics or we can do a quick Monte Carlo simulation that gets us pretty close. # In[12]: def moe(x, y, n): moe_x = random.choice(range((n*-1),(n+1),1)) moe_y = random.choice(range((n*-1),(n+1),1)) x += moe_x y += moe_y return x, y def sim(x, y, n, number_of_elections=20000): x = int(x) y = int(y) n = int(n) wins = 0 for i in range(number_of_elections): x1, y1 = moe(x, y, n) victory = x1 - y1 if victory >= 1: wins += 1 wins_percentage = str((float(wins) / number_of_elections)*100)+'%' return wins_percentage # Let's stick with our previous example of candidate x averaging 50 and candidate y averaging 46 with a margin of error of 4 # In[13]: x = 50 y = 46 n = 4 sim(x,y,n) # ### Margin of Error # # Let's take a quick look how those probabilities change as we increase the margin of error # In[14]: x = 50 y = 46 n = 4 print('Candidate x winning probability with a MoE of 4:', sim(x,y,n)) n = 5 print('Candidate x winning probability with a MoE of 5:', sim(x,y,n)) n = 6 print('Candidate x winning probability with a MoE of 6:', sim(x,y,n)) n = 7 print('Candidate x winning probability with a MoE of 7:', sim(x,y,n)) n = 8 print('Candidate x winning probability with a MoE of 8:', sim(x,y,n)) n = 9 print('Candidate x winning probability with a MoE of 9:', sim(x,y,n)) n = 10 print('Candidate x winning probability with a MoE of 10:', sim(x,y,n)) # We are using **unweighted** polling averages. This means that every poll entered into the average is treated like all others. Polling aggregators **claim** they weight individual polls in order to produce correct probabilities. # # It is universally understood that a prediction above 50% is a prediction for that outcome. The strength of that prediction is how far above 50% that probability may be. # # For example, a 99% probability is a more confident prediction than 65%. # # To clarify: # **Prediction is above 50% # Probability is confidence in the prediction** # # Applying State Probabilities to the 2016 Election # In[15]: df = pd.read_csv('backtest2016.csv') # We need to decide what margin of error to use. We can take a guess or we can look at the actual results. # Let's create a margin of victory column and see what we have. # In[16]: df['spread'] = abs(df['trump_rcp'] - df['clinton_rcp']) # In[17]: df.plot(x='state',y='spread',kind='bar',figsize=(16,4), title='Polling Average Spread by State/District',legend=False) # We can see from the chart above there are states that are "safe states" for candidates. These safe states are states in which one candidate does not overlap the other within the margin of error. In other words, one candidate's *lowest* possible result is above another candidate's *highest* possible result. # ![example12](https://raw.githubusercontent.com/ahoaglandnu/election/master/images/ex12.png) # Let's refer to the qualitative experts to find a cutoff for what constitutes a "safe state." For this we will use the Cook Political Report. # In[18]: qual = pd.read_csv('aggpreds2016.csv') qual = qual[['State','Cook']] rep_safe = list(qual[qual['Cook'].str.startswith('Solid R', na=False)]['State'].values) qual[qual['Cook'].str.startswith('Solid R', na=False)] # In[19]: dem_safe = list(qual[qual['Cook'].str.startswith('Solid D', na=False)]['State'].values) qual[qual['Cook'].str.startswith('Solid D', na=False)] # In[20]: spread_states = df[df['spread'] >= 12]['state'].values df[df['spread'] >= 12]['state'] # In[21]: for state in spread_states: if state in rep_safe or state in dem_safe: pass else: print(state) # If we use 6 as our margin of error, we can have all the states above 12 land in the "safe state" category which will be represented by a probability of 100% for the safe candidate and 0% for the opponent. # In[22]: trump_x = df['trump_rcp'].values clinton_y = df['clinton_rcp'].values clinton_x = df['clinton_rcp'].values trump_y = df['trump_rcp'].values # In[23]: def sim(x, y, n, number_of_elections=20000): x = int(x) y = int(y) n = int(n) wins = 0 for i in range(number_of_elections): x1, y1 = moe(x, y, n) victory = x1 - y1 if victory >= 1: wins += 1 wins_percentage = float(wins) / number_of_elections return wins_percentage # In[24]: trump_rcp_prob = [] for x, y in zip(trump_x, clinton_y): nwp = sim(x,y,6) trump_rcp_prob.append(100*nwp) clinton_rcp_prob = [] for x, y in zip(clinton_x, trump_y): nwp = sim(x,y,6) clinton_rcp_prob.append(100*nwp) df['trump_rcp_prob'] = trump_rcp_prob df['clinton_rcp_prob'] = clinton_rcp_prob # In[25]: df.head() # # The Electoral College # ![example13](https://raw.githubusercontent.com/ahoaglandnu/election/master/images/ex13.png) # Now that we have probabilities assigned to the states and districts, we can bring in the Electoral College scoring system. # For more detail on how it works, you can read about it [here](https://en.wikipedia.org/wiki/Electoral_college) # In[26]: print('Maximum electoral college votes possible:', np.sum(df['ec'])) print('Minimum electoral college votes to win:', int((np.sum(df['ec'])/2) + 1) ) # Let's see which of the two candidates are in the lead according to our probabilities # In[27]: print('Trump Most Likely Electoral College Results::', np.sum(df['ec'][df['trump_rcp_prob'] >= 50])) print() print('Clinton Most Likely Electoral College Results::', np.sum(df['ec'][df['clinton_rcp_prob'] >= 50])) # In[28]: print('538 - (272+230):',538 - (272+230) ) # ### Which states are missing? # In[29]: unc_states = list(df[ (df['trump_rcp_prob'] <= 50) & (df['clinton_rcp_prob'] <= 50) ]['state'].values) df[ (df['trump_rcp_prob'] <= 50) & (df['clinton_rcp_prob'] <= 50) ][['state','ec']] # This is another reason why the 2016 Presidential Election was unique. We have two states and a district where neither major candidate had a 50% probability of winning. These are statistically "tossup states" since we can assign a probability but not a prediction. # # Reusing our Simulation to Create Election Winning Probabilities # Before we can simulate the electoral college results, we need to address something unique with our probability assignments. A state awarding its electoral college votes to a candidate should be 100%, yet we have a number of states and districts that do not total 100%. # # Let us determine this probability for each state and make that our uncertainty probability. # In[30]: df['unc_prob'] = 100 - (df['trump_rcp_prob'] + df['clinton_rcp_prob']) # In[31]: df[['state','trump_rcp_prob','clinton_rcp_prob','unc_prob']].sort_values('unc_prob',ascending=False) # ### Simulating One Election # In[32]: ec = list(df.ec.values) states = list(df.state.values) cand = list(df['trump_rcp_prob'].values) sim_election = np.random.uniform()*100 ec_total = 0 for x, y, z in zip(cand, states, ec): if x > sim_election: print('Won',y,'for',z,'electoral college votes') ec_total += z print() print('Final Results:',ec_total,'electoral college votes') # ### Making state-by-state outcomes just a bit more realistic # # The challenge that comes with simulations is that if it is statistically possible then it will likely show up in the simulation. Whether to leave the possibility open ultimately becomes a matter of judgement. For this simulation, we will make any state above a 5% probability eligible for a candidate to win in the simulation. This is inline with out "safe state" assessments from the qualitative experts. # In[33]: ec = list(df.ec.values) states = list(df.state.values) cand = list(df['trump_rcp_prob'].values) sim_election = np.random.uniform(low=0.05)*100 ec_total = 0 for x, y, z in zip(cand, states, ec): if x > sim_election: print('Won',y,'for',z,'electoral college votes') ec_total += z print() print('Final Results:',ec_total,'electoral college votes') # In[34]: ec = list(df.ec.values) states = list(df.state.values) cand = list(df['clinton_rcp_prob'].values) sim_election = np.random.uniform(low=0.05)*100 ec_total = 0 for x, y, z in zip(cand, states, ec): if x > sim_election: print('Won',y,'for',z,'electoral college votes') ec_total += z print() print('Final Results:',ec_total,'electoral college votes') # ### Swing States # We will go back to the qualitative experts in order to determine swing states. # ![example14](https://raw.githubusercontent.com/ahoaglandnu/election/master/images/ex14.png) # In[35]: qual[qual['Cook'].str.startswith('Lea', na=False)] # In[36]: qual[qual['Cook'].str.startswith('Tos', na=False)] # We now have 13 swing states and two swing districts. The four toss ups will get their own simulation results independent of each other. For the remaining swing states, we can either group according to the qualitative experts or group according to the qualitative experts and geography. Combining the qualitative experts with state geography allows us to capture regional attitudes, interests, and opinions. For example, Utah and Arizona have different public opinion polling than Colorado and Nevada on the issue of legalized marijuana. # In[37]: swing_states = ['Michigan', 'Wisconsin', 'Pennsylvania', 'Colorado', 'New Hampshire', 'Nevada', 'Ohio', 'Iowa', 'Utah', 'Georgia', 'Arizona','Florida', 'North Carolina','Maine CD2','Nebraska CD2'] # In[38]: lean_r_west = ['Utah','Arizona'] lean_d_west = ['Colorado','Nevada'] lean_r_lakes = ['Ohio', 'Iowa'] lean_d_lakes = ['Michigan', 'Wisconsin', 'Pennsylvania',] tossups = ['Florida', 'North Carolina','Maine CD2','Nebraska CD2'] # In[39]: ec = list(df.ec.values) states = list(df.state.values) cand = list(df['clinton_rcp_prob'].values) sim_election = np.random.uniform(low=0.05)*100 lean_r_west_sim = np.random.uniform()*100 lean_d_west_sim = np.random.uniform()*100 lean_r_lakes_sim = np.random.uniform()*100 lean_d_lakes_sim = np.random.uniform()*100 ec_total = 0 for x, y, z in zip(cand, states, ec): if y in swing_states: if y in lean_r_west: sim_election = lean_r_west_sim if y in lean_d_west: sim_election = lean_d_west_sim if y in lean_r_lakes: sim_election = lean_r_lakes_sim if y in lean_d_lakes: sim_election = lean_d_lakes_sim if y in tossups: sim_election = np.random.uniform()*100 if x > sim_election: print('Won',y,'for',z,'electoral college votes') ec_total += z print() print('Final Results:',ec_total,'electoral college votes') # In[40]: ec = list(df.ec.values) states = list(df.state.values) cand = list(df['trump_rcp_prob'].values) ec_total = 0 sim_election = np.random.uniform(low=0.05)*100 lean_r_west_sim = np.random.uniform()*100 lean_d_west_sim = np.random.uniform()*100 lean_r_lakes_sim = np.random.uniform()*100 lean_d_lakes_sim = np.random.uniform()*100 for x, y, z in zip(cand, states, ec): if y in swing_states: if y in lean_r_west: sim_election = lean_r_west_sim if y in lean_d_west: sim_election = lean_d_west_sim if y in lean_r_lakes: sim_election = lean_r_lakes_sim if y in lean_d_lakes: sim_election = lean_d_lakes_sim if y in tossups: sim_election = np.random.uniform()*100 if x > sim_election: print('Won',y,'for',z,'electoral college votes') ec_total += z print() print('Final Results:',ec_total,'electoral college votes') # ### Running 20,000 Simulations # In[41]: def electoral_college(ec, cand, state, sims=10): cand_wins = 0 cand_ec_total = [] cand_states = [] for i in range(sims): cand_ec = 0 cand_state = [] sim_election = np.random.uniform(low=0.05)*100 lean_r_west_sim = np.random.uniform()*100 lean_d_west_sim = np.random.uniform()*100 lean_r_lakes_sim = np.random.uniform()*100 lean_d_lakes_sim = np.random.uniform()*100 for x, y, z in zip(cand, states, ec): if y in swing_states: if y in lean_r_west: sim_election = lean_r_west_sim if y in lean_d_west: sim_election = lean_d_west_sim if y in lean_r_lakes: sim_election = lean_r_lakes_sim if y in lean_d_lakes: sim_election = lean_d_lakes_sim if y in tossups: sim_election = np.random.uniform()*100 if x > sim_election: cand_ec += z cand_state.append(y) cand_ec_total.append(cand_ec) cand_states.append(cand_state) if cand_ec > 269: cand_wins += 1 return cand_wins, cand_ec_total, cand_states # In[42]: print("Monte Carlo Simulation of Electoral College Results") print() sims = 20000 ec = list(df.ec.values) states = list(df.state.values) cand_1 = list(df['clinton_rcp_prob'].values) cand_1_wins, cand_1_ec_totals, cand_1_states = electoral_college(ec, cand_1, states, sims=sims) print('Clinton Win Prob:', (cand_1_wins/sims)*100) for i,j in Counter(cand_1_ec_totals).most_common(n=1): print('Electoral College Results: Clinton',i) print('Sim Percent Outcome:',j/20000) print() cand_2 = list(df['trump_rcp_prob'].values) cand_2_wins, cand_2_ec_totals, cand_2_states = electoral_college(ec, cand_2, states, sims=sims) print('Trump Win Prob:', (cand_2_wins/sims)*100) for i,j in Counter(cand_2_ec_totals).most_common(n=1): print('Electoral College Results: Trump',i) print('Sim Percent Outcome:',j/20000) # ### Top 5 Results for Each Candidate # In[43]: for i,j in Counter(cand_1_ec_totals).most_common(n=5): print('Electoral College Results: Clinton',i) print('Sim Percent Outcome:',j/20000) # In[44]: for i,j in Counter(cand_2_ec_totals).most_common(n=5): print('Electoral College Results: Trump',i) print('Sim Percent Outcome:',j/20000) # In[45]: plt.figure(figsize=(18,8)) plt.hist(cand_2_ec_totals,500) plt.hist(cand_1_ec_totals,500) plt.axvline(x=270, color='k', linestyle='dashed') plt.title('Simulation Results') plt.show() # ### Actual 2016 Electoral College Results # # ![final](https://raw.githubusercontent.com/ahoaglandnu/election/master/images/final.png) # # ### Conclusion # # The simulation gives us more reasonable probabilities for each candidate but does not overcorrect to replicate the 2016 Electoral College results. The polling errors in Wisconsin, Pennsylvania, and Michigan would have to have been identified **prior** to running simulations when assigning probabilities to each state. # In[ ]: