In [1]:
import collections
import datetime

import numpy as np
import pandas as pd
import requests

# URL requested by get_all_results() when downloading polls.
API_ENDPOINT = "http://elections.huffingtonpost.com/pollster/api/polls"

# Seed NumPy's global RNG, which run_simulation() draws from, so the simulation
# cells give repeatable numbers when the notebook is run top to bottom.
np.random.seed(2016)
In [2]:
def get_all_results(state='US', party='gop', start_date='2015-6-1'):
    """Yield one record per named candidate from every matching poll.

    Requests successive pages from ``API_ENDPOINT`` and stops after the first
    page that contains fewer polls than a full page.

    Parameters
    ----------
    state : str
        Sent as the ``state`` query parameter.
    party : str
        Interpolated into the topic ``2016-president-<party>-primary``.
    start_date : str
        Sent as the ``after`` query parameter.

    Yields
    ------
    dict
        Keys ``poll`` (poll id), ``date`` (poll end date as returned by the
        API), ``filter`` (lower-cased subpopulation name), ``obs``
        (subpopulation observations), ``candidate`` ("First Last") and
        ``mean`` (the response value).

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    # A page shorter than this is treated as the last one.
    # NOTE(review): presumably the API's fixed page size -- confirm.
    page_size = 10

    topic = '2016-president-{}-primary'.format(party)
    params = {'state': state,
              'after': start_date,
              'topic': topic
             }
    page = 1
    while True:
        params['page'] = page
        reply = requests.get(API_ENDPOINT, params=params)
        # Fail loudly on HTTP errors instead of trying to parse an error body.
        reply.raise_for_status()
        page_results = reply.json()
        for poll in page_results:
            # Use a default with next(): a bare next() that finds nothing would
            # raise StopIteration inside this generator, which Python 3.7+
            # (PEP 479) converts into a RuntimeError and aborts the download.
            question = next((q for q in poll['questions']
                             if q['topic'] == topic),
                            None)
            if question is None or not question['subpopulations']:
                # Nothing usable for this topic in this poll; skip it.
                continue
            # Only the first subpopulation of the matching question is used.
            subpop = question['subpopulations'][0]
            for choice in subpop['responses']:
                # Answers without a first name are skipped (presumably
                # "Undecided"/"Other"-style rows -- verify against the API).
                if choice['first_name']:
                    yield {'poll': poll['id'],
                           'date': poll['end_date'],
                           'filter': subpop['name'].lower(),
                           'obs': subpop['observations'],
                           'candidate': '{} {}'.format(choice['first_name'], choice['last_name']),
                           'mean': choice['value']}

        if len(page_results) < page_size:
            break
        page += 1
In [3]:
def get_polls(state='US', party='gop', start_date='2015-6-1'):
    """Collect the records from ``get_all_results`` into a DataFrame.

    Parameters
    ----------
    state, party, start_date
        Passed through unchanged to ``get_all_results``.

    Returns
    -------
    pandas.DataFrame
        One row per yielded record with columns ``poll``, ``date``,
        ``filter``, ``obs``, ``candidate`` and ``mean``; ``date`` is converted
        with ``pd.to_datetime``. When no records are returned the frame is
        empty but still has these columns.
    """
    # Naming the columns explicitly keeps the 'date' column present even when
    # the query matches nothing; otherwise the lookup below raises KeyError.
    # These are the keys of the dicts yielded by get_all_results.
    columns = ['poll', 'date', 'filter', 'obs', 'candidate', 'mean']
    records = list(get_all_results(state=state, party=party, start_date=start_date))
    polls = pd.DataFrame(records, columns=columns)
    polls['date'] = pd.to_datetime(polls['date'])
    return polls
In [4]:
def get_distribution_for_date(polls, target_date=None, window=30):
    """Pool recent polls into a per-candidate mean and standard deviation.

    Parameters
    ----------
    polls : pandas.DataFrame
        Needs the columns ``date``, ``candidate``, ``obs`` and ``mean``
        (``mean`` is divided by 100 here, i.e. treated as a percentage).
    target_date : datetime.datetime, optional
        Reference date; ``datetime.datetime.today()`` when omitted.
    window : int
        Number of days before ``target_date`` to include. A poll is kept when
        ``target_date - window days < date <= target_date``.

    Returns
    -------
    pandas.DataFrame
        Indexed by candidate with columns ``mean`` and ``std``; candidates
        whose pooled mean is not positive are dropped.
    """
    if target_date is None:
        target_date = datetime.datetime.today()

    # Keep only polls that ended inside the look-back window.
    cutoff = target_date - datetime.timedelta(window)
    in_window = (polls['date'] <= target_date) & (polls['date'] > cutoff)
    recent = polls[in_window]

    # Inverse-square recency weight: 1 for a poll on target_date, 1/4 for one
    # a day older, 1/9 for two days older, and so on.
    age_in_days = (target_date - recent['date']) / np.timedelta64(1, 'D')
    recency = 1 / np.square(age_in_days + 1)

    # Weighted sample size and weighted "votes" for each poll row.
    per_row = recent[['candidate']].assign(
        n=recency * recent['obs'],
        votes=recent['mean'] / 100 * recent['obs'] * recency,
    )
    totals = per_row.groupby('candidate').sum()

    share = totals['votes'] / totals['n']
    # Binomial standard error using the weighted sample size.
    spread = np.sqrt((share * (1 - share)) / totals['n'])

    summary = totals.assign(mean=share, std=spread)[['mean', 'std']]
    return summary[summary['mean'] > 0].copy()
In [5]:
def run_simulation(dists, trials=10000):
    """Estimate each candidate's chance of finishing first.

    Every trial draws one value per candidate from an independent normal
    distribution with that candidate's ``mean`` and ``std``; the candidate
    with the largest draw wins the trial.

    Parameters
    ----------
    dists : pandas.DataFrame
        Indexed by candidate, with columns ``mean`` and ``std``.
    trials : int
        Number of simulated contests.

    Returns
    -------
    pandas.Series
        Fraction of trials won, indexed by candidate. Candidates that never
        win a trial are absent. Empty when ``dists`` has no rows or
        ``trials`` is not positive.
    """
    # With no candidates there is nothing to take a maximum over.
    if trials <= 0 or len(dists) == 0:
        return pd.Series(dtype=float)

    centers = np.asarray(dists['mean'], dtype=float)
    spreads = np.asarray(dists['std'], dtype=float)
    # One vectorised draw of shape (trials, candidates) instead of one Python
    # level call per trial. It uses NumPy's global RNG, so np.random.seed()
    # still controls the outcome.
    samples = np.random.normal(centers, spreads, size=(trials, len(dists)))
    runs = pd.DataFrame(samples, columns=dists.index)

    winners = runs.idxmax(axis=1)
    results = pd.Series(collections.Counter(winners))
    return results / results.sum()
In [6]:
def predict(state='us', party='gop', window=30, trials=10000, target_date=None):
    """Download polls, pool them, simulate the contest and print both tables.

    Parameters
    ----------
    state, party
        Passed to ``get_polls`` to select which polls are downloaded.
    window, target_date
        Passed to ``get_distribution_for_date``.
    trials
        Passed to ``run_simulation``.

    Returns
    -------
    None
        Results are printed: first the pooled mean/std per candidate, then
        the simulated win fractions, both sorted descending and formatted as
        percentages with one decimal place.
    """
    as_percent = '{:.1%}'.format

    polls = get_polls(state=state, party=party)
    dists = get_distribution_for_date(polls, window=window, target_date=target_date)

    print('Superpoll Results:')
    ranked = dists.sort_values('mean', ascending=False)
    # Format column by column with Series.map: DataFrame.applymap is
    # deprecated in recent pandas, while this form works on old and new
    # versions and produces the same table.
    print(ranked.apply(lambda column: column.map(as_percent)))
    print()
    print('Simulation Results:')
    odds = run_simulation(dists, trials=trials).sort_values(ascending=False)
    print(odds.map(as_percent))
In [7]:
# Reference date shared by all forecast cells below.
target_date = datetime.datetime(2016, 2, 1)
# Forecast for state='ia', party='gop' as of target_date, using predict()'s
# default 30-day window and 10000 trials. Each predict() call downloads the
# polls again through get_polls().
predict(state='ia', party='gop', target_date=target_date)
Superpoll Results:
                 mean   std
candidate                  
Donald Trump    28.2%  2.0%
Ted Cruz        23.6%  1.8%
Marco Rubio     17.4%  1.6%
Ben Carson       7.6%  1.2%
Rand Paul        4.7%  0.9%
Jeb Bush         4.1%  0.9%
Mike Huckabee    3.3%  0.8%
John Kasich      2.8%  0.7%
Carly Fiorina    2.4%  0.7%
Chris Christie   2.1%  0.6%
Rick Santorum    1.3%  0.5%
Jim Gilmore      0.1%  0.2%

Simulation Results:
Donald Trump    96.0%
Ted Cruz         4.0%
dtype: object
In [8]:
# Same state and target date, but for party='dem' (default 30-day window).
predict(state='ia', party='dem', target_date=target_date)
Superpoll Results:
                  mean   std
candidate                   
Hillary Clinton  47.4%  2.4%
Bernie Sanders   46.0%  2.4%
Martin O'Malley   3.6%  0.9%

Simulation Results:
Hillary Clinton    66.0%
Bernie Sanders     34.0%
dtype: object
In [9]:
# Repeat the party='gop' forecast with window=4, i.e. only polls dated within
# the 4 days up to target_date, for comparison with the 30-day default.
predict(state='ia', party='gop', target_date=target_date,  window=4)
Superpoll Results:
                 mean   std
candidate                  
Donald Trump    27.5%  2.1%
Ted Cruz        23.1%  2.0%
Marco Rubio     18.1%  1.9%
Ben Carson       7.5%  1.3%
Rand Paul        5.1%  1.1%
Jeb Bush         4.1%  0.9%
Mike Huckabee    3.5%  0.9%
John Kasich      2.8%  0.8%
Carly Fiorina    2.5%  0.7%
Chris Christie   2.0%  0.7%
Rick Santorum    1.3%  0.5%

Simulation Results:
Donald Trump    93.6%
Ted Cruz         6.4%
Marco Rubio      0.0%
dtype: object
In [10]:
# Repeat the party='dem' forecast with window=4, i.e. only polls dated within
# the 4 days up to target_date, for comparison with the 30-day default.
predict(state='ia', party='dem', target_date=target_date, window=4)
Superpoll Results:
                  mean   std
candidate                   
Hillary Clinton  47.0%  2.7%
Bernie Sanders   46.9%  2.7%
Martin O'Malley   3.2%  1.0%

Simulation Results:
Hillary Clinton    51.0%
Bernie Sanders     49.0%
dtype: object