import collections
import datetime
import numpy as np
import pandas as pd
import requests
# Poll-listing URL of the HuffPost Pollster API; get_all_results pages through it.
API_ENDPOINT = "http://elections.huffingtonpost.com/pollster/api/polls"
# Seed NumPy's global RNG so the random draws in run_simulation repeat between runs.
np.random.seed(2016)
def get_all_results(state='US', party='gop', start_date='2015-6-1'):
    """Yield one record per named candidate response from the polls API.

    Requests ``API_ENDPOINT`` page by page for the topic
    ``2016-president-<party>-primary`` and, for every poll returned, reads the
    first subpopulation of the first question whose topic matches.

    Args:
        state: Value sent as the ``state`` query parameter.
        party: Party slug interpolated into the topic name.
        start_date: Value sent as the ``after`` query parameter.

    Yields:
        dict with keys ``poll``, ``date``, ``filter``, ``obs``, ``candidate``
        and ``mean``, one per response that has a non-empty ``first_name``.

    Raises:
        requests.HTTPError: If a page request returns an error status.
    """
    topic = '2016-president-{}-primary'.format(party)
    params = {
        'state': state,
        'after': start_date,
        'topic': topic,
    }
    page = 1
    while True:
        params['page'] = page
        reply = requests.get(API_ENDPOINT, params=params)
        # Surface HTTP failures here rather than as confusing key/type errors
        # when an error payload is iterated as if it were a list of polls.
        reply.raise_for_status()
        page_results = reply.json()
        for poll in page_results:
            # next() needs a default: a bare StopIteration inside a generator
            # becomes RuntimeError (PEP 479) and would abort the whole stream.
            question = next(
                (q for q in poll['questions'] if q['topic'] == topic), None)
            if question is None or not question['subpopulations']:
                # Nothing usable for this topic in this poll; skip it.
                continue
            subpop = question['subpopulations'][0]
            for response in subpop['responses']:
                # Responses with an empty first_name are skipped
                # (presumably "Undecided"/"Other" style rows - TODO confirm).
                if response['first_name']:
                    yield {'poll': poll['id'],
                           'date': poll['end_date'],
                           'filter': subpop['name'].lower(),
                           'obs': subpop['observations'],
                           'candidate': '{} {}'.format(response['first_name'],
                                                       response['last_name']),
                           'mean': response['value']}
        # A short page is treated as the last one
        # (assumes the API serves 10 polls per page - TODO confirm).
        if len(page_results) < 10:
            break
        page += 1
def get_polls(state='US', party='gop', start_date='2015-6-1'):
    """Collect the records from ``get_all_results`` into a DataFrame.

    Args:
        state: Forwarded to ``get_all_results``.
        party: Forwarded to ``get_all_results``.
        start_date: Forwarded to ``get_all_results``.

    Returns:
        pandas.DataFrame with one row per yielded record and the ``date``
        column converted with ``pd.to_datetime``. When nothing is yielded the
        frame is empty but still carries the expected columns.
    """
    rows = list(get_all_results(state=state, party=party, start_date=start_date))
    if rows:
        polls = pd.DataFrame(rows)
    else:
        # Without rows pandas creates no columns at all, and the 'date'
        # lookup below would raise KeyError; keep the usual schema instead.
        polls = pd.DataFrame(
            columns=['poll', 'date', 'filter', 'obs', 'candidate', 'mean'])
    polls['date'] = pd.to_datetime(polls['date'])
    return polls
def get_distribution_for_date(polls, target_date=None, window=30):
    """Pool recent polls into a per-candidate mean and standard deviation.

    Only rows whose ``date`` falls in the half-open interval
    ``(target_date - window days, target_date]`` are used. Each row is
    weighted by ``1 / (age_in_days + 1) ** 2`` so newer polls count more.

    Args:
        polls: DataFrame with ``date``, ``candidate``, ``obs`` and ``mean``
            columns; ``mean`` is divided by 100 before use.
        target_date: Reference datetime; defaults to
            ``datetime.datetime.today()``.
        window: Number of days before ``target_date`` to include.

    Returns:
        DataFrame indexed by candidate with ``mean`` (weighted share) and
        ``std`` (``sqrt(mean * (1 - mean) / n)`` with ``n`` the weighted
        observation count), limited to candidates whose mean is above zero.
    """
    if target_date is None:
        target_date = datetime.datetime.today()

    earliest = target_date - datetime.timedelta(window)
    in_window = (polls['date'] > earliest) & (polls['date'] <= target_date)
    recent = polls[in_window]

    # Age of each poll in (fractional) days, turned into a quadratic decay.
    age_days = (target_date - recent['date']) / np.timedelta64(1, 'D')
    decay = 1 / np.square(age_days + 1)

    frame = recent[['candidate']].copy()
    frame['n'] = decay * recent['obs']
    frame['votes'] = recent['mean'] / 100 * recent['obs'] * decay

    totals = frame.groupby('candidate').sum()
    share = totals['votes'] / totals['n']
    totals['mean'] = share
    totals['std'] = np.sqrt((share * (1 - share)) / totals['n'])

    summary = totals[['mean', 'std']]
    return summary[summary['mean'] > 0].copy()
def run_simulation(dists, trials=10000):
    """Estimate how often each candidate comes out on top by random sampling.

    For every trial one value per candidate is drawn from a normal
    distribution with that candidate's ``mean`` and ``std``; the candidate
    with the largest draw wins the trial. Draws come from NumPy's global
    random state.

    Args:
        dists: DataFrame indexed by candidate with ``mean`` and ``std``
            columns.
        trials: Number of simulated trials.

    Returns:
        pandas.Series mapping each candidate that won at least one trial to
        the fraction of trials won. Empty when ``dists`` has no rows.
    """
    if dists.empty:
        # No candidates: idxmax on an empty draw matrix would raise.
        return pd.Series(dtype=float)
    # A single vectorized call fills the (trials x candidates) matrix row by
    # row, replacing `trials` separate Python-level calls.
    draws = np.random.normal(dists['mean'].values, dists['std'].values,
                             size=(trials, len(dists)))
    runs = pd.DataFrame(draws, columns=dists.index)
    results = pd.Series(collections.Counter(runs.idxmax(axis=1)))
    return results / results.sum()
def predict(state='us', party='gop', window=30, trials=10000, target_date=None):
    """Fetch polls, pool them, simulate the race and print both summaries.

    Args:
        state: Forwarded to ``get_polls``.
            NOTE(review): the default here is lowercase ``'us'`` while
            ``get_polls`` defaults to ``'US'`` - confirm the API ignores case.
        party: Forwarded to ``get_polls``.
        window: Forwarded to ``get_distribution_for_date``.
        trials: Forwarded to ``run_simulation``.
        target_date: Forwarded to ``get_distribution_for_date``.

    Returns:
        None. The pooled means/standard deviations and the simulated win
        shares are printed as percentages, highest first.
    """
    as_percent = '{:.1%}'.format
    polls = get_polls(state=state, party=party)
    dists = get_distribution_for_date(polls, window=window, target_date=target_date)
    print('Superpoll Results:')
    ranked = dists.sort_values('mean', ascending=False)
    # Format column by column with Series.map; DataFrame.applymap is
    # deprecated in recent pandas and this form works on old and new versions.
    print(ranked.apply(lambda column: column.map(as_percent)))
    print()
    print('Simulation Results:')
    odds = run_simulation(dists, trials=trials).sort_values(ascending=False)
    print(odds.map(as_percent))
# Reference date shared by the example runs in this script.
target_date = datetime.datetime(2016, 2, 1)
# 'gop' race for state 'ia', using predict's default 30-day window.
predict(state='ia', party='gop', target_date=target_date)
Superpoll Results: mean std candidate Donald Trump 28.2% 2.0% Ted Cruz 23.6% 1.8% Marco Rubio 17.4% 1.6% Ben Carson 7.6% 1.2% Rand Paul 4.7% 0.9% Jeb Bush 4.1% 0.9% Mike Huckabee 3.3% 0.8% John Kasich 2.8% 0.7% Carly Fiorina 2.4% 0.7% Chris Christie 2.1% 0.6% Rick Santorum 1.3% 0.5% Jim Gilmore 0.1% 0.2% Simulation Results: Donald Trump 96.0% Ted Cruz 4.0% dtype: object
# 'dem' race for state 'ia' at target_date, using predict's default 30-day window.
predict(state='ia', party='dem', target_date=target_date)
Superpoll Results: mean std candidate Hillary Clinton 47.4% 2.4% Bernie Sanders 46.0% 2.4% Martin O'Malley 3.6% 0.9% Simulation Results: Hillary Clinton 66.0% Bernie Sanders 34.0% dtype: object
# 'gop' race for state 'ia' again, restricted to polls from the 4 days up to target_date.
predict(state='ia', party='gop', target_date=target_date, window=4)
Superpoll Results: mean std candidate Donald Trump 27.5% 2.1% Ted Cruz 23.1% 2.0% Marco Rubio 18.1% 1.9% Ben Carson 7.5% 1.3% Rand Paul 5.1% 1.1% Jeb Bush 4.1% 0.9% Mike Huckabee 3.5% 0.9% John Kasich 2.8% 0.8% Carly Fiorina 2.5% 0.7% Chris Christie 2.0% 0.7% Rick Santorum 1.3% 0.5% Simulation Results: Donald Trump 93.6% Ted Cruz 6.4% Marco Rubio 0.0% dtype: object
# 'dem' race for state 'ia' again, restricted to polls from the 4 days up to target_date.
predict(state='ia', party='dem', target_date=target_date, window=4)
Superpoll Results: mean std candidate Hillary Clinton 47.0% 2.7% Bernie Sanders 46.9% 2.7% Martin O'Malley 3.2% 1.0% Simulation Results: Hillary Clinton 51.0% Bernie Sanders 49.0% dtype: object