import numpy as np
import scipy as sp
import pandas as pd
import sklearn
import seaborn as sns
from matplotlib import pyplot as plt
import bs4
import statsmodels.formula.api as smf
%matplotlib inline
#nice defaults for matplotlib
from matplotlib import rcParams
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
(0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
(0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
(0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
(0.4, 0.6509803921568628, 0.11764705882352941),
(0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
(0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
(0.4, 0.4, 0.4)]
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.prop_cycle'] = plt.cycler(color=dark2_colors)  #'axes.color_cycle' in matplotlib < 1.5
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = True
rcParams['axes.facecolor'] = '#eeeeee'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'
When was the election?
#Wikipedia page with table of election dates
theurl = "http://en.wikipedia.org/wiki/Election_Day_(United_States)"
#Grab table from page. Use "Midterm" as a unique identifier of the table
#we want. Grab the first returned dataframe.
wikitable = pd.read_html(theurl, match="Midterm", header=0)[0]
#Bingo. But note that the dates are automatically set to 2014.
print(wikitable.head())
Year | Day | Details | Type | |
---|---|---|---|---|
0 | 2000 | 2014-11-07 | United States elections, 2000 | Presidential |
1 | 2001 | 2014-11-06 | United States elections, 2001 | Off-year |
2 | 2002 | 2014-11-05 | United States elections, 2002 | Midterm |
3 | 2003 | 2014-11-04 | United States elections, 2003 | Off-year |
4 | 2004 | 2014-11-02 | United States elections, 2004 | Presidential |
def set_year(dateobj, year):
d2 = dateobj.replace(year = year)
return d2
wikitable['electionday'] = wikitable.apply(lambda r: set_year(r['Day'], r['Year']), axis=1)
wikitable.head()
Year | Day | Details | Type | electionday | |
---|---|---|---|---|---|
0 | 2000 | 2014-11-07 | United States elections, 2000 | Presidential | 2000-11-07 |
1 | 2001 | 2014-11-06 | United States elections, 2001 | Off-year | 2001-11-06 |
2 | 2002 | 2014-11-05 | United States elections, 2002 | Midterm | 2002-11-05 |
3 | 2003 | 2014-11-04 | United States elections, 2003 | Off-year | 2003-11-04 |
4 | 2004 | 2014-11-02 | United States elections, 2004 | Presidential | 2004-11-02 |
#Grab all data
filedates = ["2002_2008", "2010", "2012"]
pollstercsvs = [pd.read_csv("pollster_cleaned_%s.csv"%fname, parse_dates=['end.date'])
for fname in filedates]
#Paste them together
oldpolls = pd.concat(pollstercsvs)
oldpolls = pd.merge(oldpolls, wikitable, how="inner", left_on="year", right_on="Year")
#Filter to senate races
oldpolls = oldpolls.loc[oldpolls.race.str.contains("Sen"),:]
#Compute number of days between poll's end date and election day.
oldpolls['day'] = oldpolls.apply(lambda x: (x['end.date']-x['electionday']).days, axis=1)
#Filter down to polls taken within a year of the election.
oldpolls = oldpolls.loc[(oldpolls.day <= 0) & (oldpolls.day >= -365),:]
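Note: the row-wise `apply` above works but can be slow; assuming both columns are datetime64, an equivalent vectorized form is `(oldpolls['end.date'] - oldpolls['electionday']).dt.days`.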
Deal with independents: when a third-party candidate outpolls one of the major-party candidates, substitute the third-party numbers for that candidate's, so each race reduces to its top two competitors.
def grab_greater(df, focus, champ, chal):
return np.where((df[chal] > df[focus]) & (df[champ] > df[focus]),
df[chal], df[focus])
oldpolls['real.dem2'] = grab_greater(oldpolls, 'real.dem', 'real.rep', 'real.oth')
oldpolls['real.rep2'] = grab_greater(oldpolls, 'real.rep', 'real.dem', 'real.oth')
oldpolls['vote.dem2'] = grab_greater(oldpolls, 'vote.dem', 'vote.rep', 'vote.oth')
oldpolls['vote.rep2'] = grab_greater(oldpolls, 'vote.rep', 'vote.dem', 'vote.oth')
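To see what `grab_greater` is doing, here is a toy example (hypothetical numbers, not from the dataset):
#In the first race the independent (30) outpolls the Democrat (20) while the
#Republican also leads, so the independent's share is substituted in; in the
#second race the Democrat keeps his or her own number.
toy = pd.DataFrame({'real.dem': [20, 40], 'real.rep': [45, 45], 'real.oth': [30, 10]})
print(grab_greater(toy, 'real.dem', 'real.rep', 'real.oth'))  #[30 40]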
Now compute differences and bias. Here "bias" is a poll's prediction of the Republican's advantage minus the Republican's actual win margin.
oldpolls['diff'] = oldpolls['vote.rep2']-oldpolls['vote.dem2']
oldpolls['reald'] = oldpolls['real.rep2']-oldpolls['real.dem2']
oldpolls['bias'] = oldpolls['diff']-oldpolls['reald']
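For example (made-up numbers): a poll showing the Republican at 48 and the Democrat at 45 gives `diff` = 3; if the actual result was 52 to 47, then `reald` = 5 and `bias` = -2, meaning the poll understated the Republican's margin by two points.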
Note that there are some flipped values.
print(oldpolls.loc[oldpolls.bias.abs() >= 50,["vote.rep2", "vote.dem2", "real.rep2", "real.dem2"]])
vote.rep2 | vote.dem2 | real.rep2 | real.dem2 | |
---|---|---|---|---|
4651 | 32 | 65 | 63.36 | 36.52 |
Drop these polls.
oldpolls = oldpolls.loc[oldpolls.bias.abs() < 50,:]
Now make some exploratory plots. First, plot the number of polls per week per race.
#`value_counts` counts the number of times each value is repeated in a series.
day_table = (oldpolls['day']/7).round().value_counts()
#`nunique` counts the number of unique values in a series.
num_races = oldpolls['race'].nunique()
plt.bar(day_table.index, day_table/num_races)
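#Reference lines: one poll per race every two weeks (0.5) and one per week (1).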
plt.axhline(y=0.5, color='black', linestyle='--')
plt.axhline(y=1, color='black', linestyle='--')
plt.show()
#Based on the plot, restrict to the final 60 days, where polling is reasonably dense.
cutoff = -60
oldpolls = oldpolls.loc[oldpolls.day > cutoff,:]
We might want to filter out certain unreliable polls. Explore different groupings of polls (for example, by polling company, party affiliation of the pollster, or sampled population) to see if any stick out in terms of prediction bias.
Boxplots are a good way to explore this graphically.
#For this analysis, consider only polls within a week of the election.
tmppolls = oldpolls.loc[oldpolls.day >= -7,:].copy()  #copy so the new columns below don't trigger chained-assignment warnings
#Compute number of polls taken by each pollster, in a format that lets you save this
#back to the original dataframe. Every poll corresponding to a given pollster will
#have the same value for 'npolls' -- `transform` creates these repeats for you.
tmppolls['npolls'] = tmppolls.groupby("pollster")['pollster'].transform('size')
#Only analyze pollsters with at least 2 polls.
tmppolls = tmppolls.loc[tmppolls.npolls >= 2,:]
#Compute mean bias of each pollster. Save this back to the original dataframe.
tmppolls['mean_bias'] = tmppolls.groupby("pollster")['bias'].transform(np.mean)
#Make boxplot grouped by and sorted by mean bias.
tmppolls.boxplot("bias", by="mean_bias", rot=90)
#To get the right labels on the boxplot, recompute the mean bias, but this time
#in a way that returns only one number per pollster. `aggregate` collapses each
#pollster group to a single value.
pollster_bias = tmppolls.groupby("pollster")['bias'].aggregate(np.mean)
#Sort the pollster biases, then use the sorted index to label the boxplot.
pollster_bias = pollster_bias.sort_values()
plt.xticks(np.arange(len(pollster_bias)), pollster_bias.index, fontsize=10)
plt.ylim((-25,25))
plt.show()
We can also explore these biases using a linear regression that explains bias in terms of the categorical variable `pollster`.
pollster_anova = smf.ols(formula="bias ~ pollster - 1", data=tmppolls).fit()
pollster_anova.summary()
Dep. Variable: | bias | R-squared: | 0.188 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.094 |
Method: | Least Squares | F-statistic: | 2.009 |
Date: | Sun, 26 Oct 2014 | Prob (F-statistic): | 9.19e-05 |
Time: | 18:37:17 | Log-Likelihood: | -1539.3 |
No. Observations: | 503 | AIC: | 3183. |
Df Residuals: | 451 | BIC: | 3402. |
Df Model: | 52 |
coef | std err | t | P>|t| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
pollster[ARG] | -4.0400 | 3.855 | -1.048 | 0.295 | -11.616 3.536 |
pollster[Angus-Reid] | 3.6800 | 2.438 | 1.509 | 0.132 | -1.111 8.471 |
pollster[BlumWeprin] | 2.5167 | 3.148 | 0.800 | 0.424 | -3.669 8.702 |
pollster[CNN] | 2.2875 | 2.726 | 0.839 | 0.402 | -3.069 7.644 |
pollster[CNN/Time] | 2.7400 | 2.438 | 1.124 | 0.262 | -2.051 7.531 |
pollster[Columbus Dispatch] | -1.1500 | 3.855 | -0.298 | 0.766 | -8.726 6.426 |
pollster[ColumbusDispatch] | -4.3450 | 3.855 | -1.127 | 0.260 | -11.921 3.231 |
pollster[Dan Jones & Associates] | -0.0500 | 3.855 | -0.013 | 0.990 | -7.626 7.526 |
pollster[EPIC] | -0.2750 | 2.726 | -0.101 | 0.920 | -5.632 5.082 |
pollster[FOX] | 4.4500 | 2.726 | 1.633 | 0.103 | -0.907 9.807 |
pollster[FairleighDickinsonUniv] | -3.4450 | 3.855 | -0.894 | 0.372 | -11.021 4.131 |
pollster[Field] | 0.5250 | 3.855 | 0.136 | 0.892 | -7.051 8.101 |
pollster[Gallup] | -0.0250 | 1.724 | -0.015 | 0.988 | -3.413 3.363 |
pollster[Ipsos/Reuters (Web)] | 0.6111 | 1.285 | 0.476 | 0.635 | -1.914 3.136 |
pollster[JZ Analytics/Newsmax] | -0.4444 | 1.817 | -0.245 | 0.807 | -4.016 3.127 |
pollster[Marist/McClatchy] | 3.8000 | 2.726 | 1.394 | 0.164 | -1.557 9.157 |
pollster[MaristCollege] | 4.0933 | 3.148 | 1.300 | 0.194 | -2.092 10.279 |
pollster[MarketingWorkshop] | -7.2900 | 3.855 | -1.891 | 0.059 | -14.866 0.286 |
pollster[Mason-Dixon] | 0.7250 | 2.726 | 0.266 | 0.790 | -4.632 6.082 |
pollster[MasonDixon] | 4.0093 | 1.012 | 3.960 | 0.000 | 2.020 5.999 |
pollster[McLaughlin (R-Mourdock)] | 7.2000 | 3.855 | 1.868 | 0.062 | -0.376 14.776 |
pollster[MinneapolistStarTribune] | -3.0820 | 2.438 | -1.264 | 0.207 | -7.873 1.709 |
pollster[Mitchell] | 3.6100 | 3.148 | 1.147 | 0.252 | -2.576 9.796 |
pollster[MonmouthUniv] | 0.5550 | 3.855 | 0.144 | 0.886 | -7.021 8.131 |
pollster[Muhlenberg] | 2.3333 | 2.226 | 1.048 | 0.295 | -2.041 6.707 |
pollster[NBC/WSJ/Marist] | 2.6667 | 3.148 | 0.847 | 0.397 | -3.519 8.852 |
pollster[OnPoint] | -1.6375 | 2.726 | -0.601 | 0.548 | -6.994 3.719 |
pollster[POS] | 5.4167 | 3.148 | 1.721 | 0.086 | -0.769 11.602 |
pollster[PPP] | 1.2230 | 1.724 | 0.709 | 0.478 | -2.165 4.611 |
pollster[PPP (D)] | 3.1200 | 0.995 | 3.135 | 0.002 | 1.164 5.076 |
pollster[Pulse Opinion Research/Let Freedom Ring (R)] | 6.8667 | 3.148 | 2.182 | 0.030 | 0.681 13.052 |
pollster[Quinnipiac] | -0.1333 | 3.148 | -0.042 | 0.966 | -6.319 6.052 |
pollster[QuinnipiacUniv] | 1.7900 | 3.148 | 0.569 | 0.570 | -4.396 7.976 |
pollster[Rasmussen] | 1.9411 | 0.909 | 2.136 | 0.033 | 0.155 3.727 |
pollster[Research2000] | -3.0276 | 1.190 | -2.545 | 0.011 | -5.366 -0.690 |
pollster[RutgersUniv] | 1.4800 | 3.855 | 0.384 | 0.701 | -6.096 9.056 |
pollster[ScrippsDataCenter] | 1.1700 | 3.855 | 0.304 | 0.762 | -6.406 8.746 |
pollster[Selzer] | 1.3150 | 2.726 | 0.482 | 0.630 | -4.042 6.672 |
pollster[Siena] | 3.3000 | 3.855 | 0.856 | 0.392 | -4.276 10.876 |
pollster[StrategicVision] | 3.0444 | 1.817 | 1.675 | 0.095 | -0.527 6.616 |
pollster[SurveyUSA] | 0.9203 | 0.896 | 1.027 | 0.305 | -0.841 2.682 |
pollster[Susquehanna] | 0.4000 | 3.148 | 0.127 | 0.899 | -5.786 6.586 |
pollster[TalmeyDrake] | -7.9300 | 3.855 | -2.057 | 0.040 | -15.506 -0.354 |
pollster[Tarrance] | 2.8050 | 3.855 | 0.728 | 0.467 | -4.771 10.381 |
pollster[The Ohio Poll/University of Cincinnati] | 1.5000 | 3.855 | 0.389 | 0.697 | -6.076 9.076 |
pollster[UNH] | -3.1320 | 2.438 | -1.285 | 0.200 | -7.923 1.659 |
pollster[UnivCincinnati] | 2.6550 | 3.855 | 0.689 | 0.491 | -4.921 10.231 |
pollster[We Ask America] | 4.4000 | 2.726 | 1.614 | 0.107 | -0.957 9.757 |
pollster[Wenzel Strategies (R-Citizens United)] | 9.2500 | 3.855 | 2.400 | 0.017 | 1.674 16.826 |
pollster[YouGov] | 1.8127 | 0.598 | 3.029 | 0.003 | 0.637 2.989 |
pollster[YouGov/Polimetrix] | 0.3000 | 1.927 | 0.156 | 0.876 | -3.488 4.088 |
pollster[Zogby] | 0.9743 | 0.595 | 1.638 | 0.102 | -0.195 2.143 |
Omnibus: | 29.210 | Durbin-Watson: | 1.448 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 96.760 |
Skew: | 0.062 | Prob(JB): | 9.75e-22 |
Kurtosis: | 5.145 | Cond. No. | 6.48 |
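Because the formula drops the intercept (`- 1`), each `pollster[...]` coefficient is simply that pollster's mean bias. A quick sanity check (a sketch; the index cleanup assumes the coefficient names follow the `pollster[...]` pattern above):
#Strip the "pollster[...]" wrapper, then compare coefficients to group means.
coef = pollster_anova.params.copy()
coef.index = coef.index.str.extract(r"pollster\[(.*)\]", expand=False)
group_means = tmppolls.groupby("pollster")['bias'].mean()
print(np.allclose(coef.sort_index(), group_means.sort_index()))  #Expect True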
tmppolls.boxplot("bias", by="party")
plt.show()
party_anova = smf.ols(formula="bias ~ party - 1", data=tmppolls).fit()
party_anova.summary()
tmppolls.boxplot("bias", by="pop")
plt.show()
pop_anova = smf.ols(formula="bias ~ pop - 1", data=tmppolls).fit()
pop_anova.summary()
Dep. Variable: | bias | R-squared: | 0.059 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.053 |
Method: | Least Squares | F-statistic: | 10.44 |
Date: | Sun, 26 Oct 2014 | Prob (F-statistic): | 1.13e-06 |
Time: | 18:37:21 | Log-Likelihood: | -1573.5 |
No. Observations: | 502 | AIC: | 3153. |
Df Residuals: | 499 | BIC: | 3166. |
Df Model: | 3 |
coef | std err | t | P>|t| | [95.0% Conf. Int.] | |
---|---|---|---|---|---|
pop[General.Population] | 5.3100 | 5.577 | 0.952 | 0.341 | -5.647 16.267 |
pop[Likely.Voters] | 1.3089 | 0.262 | 4.990 | 0.000 | 0.794 1.824 |
pop[Registered.Voters] | 1.8731 | 0.797 | 2.351 | 0.019 | 0.308 3.438 |
Omnibus: | 35.247 | Durbin-Watson: | 1.416 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 122.184 |
Skew: | -0.187 | Prob(JB): | 2.94e-27 |
Kurtosis: | 5.388 | Cond. No. | 21.3 |
Subset down to likely voters.
oldpolls = oldpolls.loc[(~oldpolls['pop'].isnull()) & (oldpolls['pop'].str.contains("Likely")),:]
The following code implements a prediction strategy that takes all of the polls for a given race and generates a point prediction and a predicted standard error. The model itself is fairly basic and does not take into account any of the poll-aggregation concerns covered in lab. Hint: set the `plotme` variable to `True` if you'd like to see the model fit for each race.
import patsy
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
#Cutoff for number of polls to perform "complex" prediction
#If fewer than minN polls, just average all polls together
#If more, use a spline fit to try to detect a trend.
minN = 15
#Plot the poll history, model fit, and error bar for each race, or not.
plotme = False
results = []
#Group the polls by race.
oldpolls_groups = oldpolls.groupby("race")
#Step through, group by group. For each loop iteration:
#race is assigned the name of the current group's race
#oldpolls_small is assigned the dataframe corresponding to that group.
for race, oldpolls_small in oldpolls_groups:
#Check whether to do the simple or spline fit
do_spline = oldpolls_small.shape[0] > minN
#Simple fit first
if not do_spline:
data = oldpolls_small['diff']
#Estimate is mean of all polls.
#Note that this ignores sample sizes, pollster quality, etc.
pred = np.mean(data)
        #Standard error of the sample mean, s/sqrt(n), using the plug-in (ddof=0)
        #variance; degenerate zero SEs are floored to the median SE after the loop.
        pred_se = np.sqrt(np.var(data)/float(oldpolls_small.shape[0]))
#More complex fit does a spline regression
    #Despite being "fancier", this approach also ignores sample sizes, pollster quality, etc.
else:
#Build a bunch of smooth functions of time. Throw these into a regression.
spline_mat = patsy.dmatrix("cr(day-1, df=4)", data=oldpolls_small)
pollfit = sm.OLS(oldpolls_small['diff'], spline_mat).fit()
#Use the same function generator to create a representation of election day.
#We will throw this into the regression model to get an estimate for election day.
        predmat = patsy.build_design_matrices([spline_mat.design_info],
                                              pd.DataFrame({'day': [0]}))[0]
#Get the prediction
pred = pollfit.predict(predmat)[0]
        #Predicted standard error for a regression at a new point x0, when the future
        #datapoint is expected to have no sampling noise: se = sqrt(x0 * Cov(beta_hat) * x0').
        pred_se = np.sqrt(np.dot(predmat, np.dot(pollfit.cov_params(), predmat.T)).diagonal())[0]
#Compare to the true result
result = oldpolls_small['reald'].iloc[0]
    #Construct a 99% predictive interval (two-sided z: norm.ppf(0.995) ~ 2.576).
    #Check whether it contains the true result.
    pred_interval = pred+pred_se*np.array([-2.576, 2.576])
covers = result > pred_interval[0] and result < pred_interval[1]
#Optional plotting functionality
if plotme:
#Plot the polling data.
plt.scatter(oldpolls_small['day'], oldpolls_small['diff'])
#Draw a line representing the outcome.
plt.axhline(result, color='black', linestyle='--')
#Draw the model fit.
if do_spline:
order = np.array(oldpolls_small['day'].values.argsort())
plt.plot(oldpolls_small['day'].values[order], np.array(pollfit.predict())[order], color='orange')
else:
plt.plot([oldpolls['day'].min(), oldpolls['day'].max()], [pred, pred], color='orange')
#Plot the model prediction
plt.scatter(0, pred)
#Draw the 99% confidence interval
plt.plot([0,0], pred_interval)
plt.title(race)
plt.show()
#Save all of the results of the computation.
results.append((race, oldpolls_small['year'].iloc[0], pred, pred_se, covers, result, do_spline))
#Turn results into a dataframe for further analysis.
results_df = pd.DataFrame(results, columns=["Race", "Year", "Pred", "PredSE", "Covers99", "Result", "Spline"])
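#A race whose polls all show the same margin yields an essentially zero SE;
#floor those degenerate values at the median SE.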
results_df.loc[results_df['PredSE']<0.001, 'PredSE'] = np.median(results_df['PredSE'])
Plot the performance of the predictions against the true outcomes. Break these out by year.
First, look at predicted versus actual vote share. Green points have 99% error bars that contain the actual result; orange points do not. If the model were well calibrated, each year would have 99% coverage (the percentage of predictions whose 99% intervals cover the truth). The actual coverage is in the title of each plot.
results_year = results_df.groupby("Year")
fignum = 0
for yr, year_df in results_year:
covers = year_df['Covers99']
    err99 = year_df['PredSE']*2.576
#Colors for predictions that cover or miss the truth.
cover_color = dark2_colors[0]
miss_color = "darkorange"
fignum += 1
plt.subplot(2, 3, fignum)
    #Plot predictions that cover the truth.
    plt.errorbar(year_df.loc[covers, 'Result'],
                 year_df.loc[covers, 'Pred'],
                 yerr=err99[covers], fmt='o', c=cover_color, ecolor=cover_color)
    #Plot predictions that miss.
    plt.errorbar(year_df.loc[~covers, 'Result'],
                 year_df.loc[~covers, 'Pred'],
                 yerr=err99[~covers], fmt='o', c=miss_color, ecolor=miss_color)
plt.xlim((-60,60))
plt.ylim((-60,60))
plt.plot([-60,60], [-60,60], linewidth=1, color='black')
plt.title("%d: %d%%"%(yr, (np.mean(covers)*100).round()))
plt.xlabel("Actual")
plt.ylabel("Predicted + Interval")
plt.tight_layout()
plt.show()
The model is not well calibrated: the 99% prediction intervals contain the truth far less than 99% of the time. However, the predictions track the truth fairly well, which suggests that the current model is overconfident and produces predictive intervals that are too narrow. If we widened the predictive intervals, they would contain the truth closer to 99% of the time. But these intervals (ostensibly) already account for all of the variation between polls. Where is the extra variation coming from? That's what you should think about on your homework.
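One way to quantify the overconfidence (a sketch using the `results_df` built above): compute the empirical coverage of the intervals for a range of width multipliers, and see how wide they must be before coverage approaches 99%.
#Empirical coverage as a function of the z multiplier (2.576 is the nominal 99% value).
for z in [1.0, 2.0, 2.576, 4.0, 6.0]:
    lower = results_df['Pred'] - z*results_df['PredSE']
    upper = results_df['Pred'] + z*results_df['PredSE']
    coverage = np.mean((results_df['Result'] > lower) & (results_df['Result'] < upper))
    print("z = %.3f: coverage = %.0f%%" % (z, 100*coverage))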
We can do the same sort of performance check with the binary "Dem or Republican" outcome. Compute the predicted probability of a Republican win using the normal approximation to our predictive distribution: the probability that a vote-percentage difference drawn from this distribution is greater than zero.
Since we saw that our predictive distributions were too narrow (resulting in low coverage) in the last part, try adding some additional variance to the predictive distribution.
from scipy.stats import norm
#`norm.cdf` returns the probability that a normal random variable is less than
#a particular value.
#Here we add 2 to the predicted distribution's standard deviation.
#What might this extra variation represent?
results_df['prob_rep_win'] = 1-norm.cdf(0, loc=results_df['Pred'], scale=results_df['PredSE']+2)
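Equivalently, `norm.sf` gives the upper-tail probability directly: `norm.sf(0, loc=results_df['Pred'], scale=results_df['PredSE']+2)`.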
To see whether these probabilities look better calibrated, group the data by the predicted probability of a Republican win. See whether the empirical win percentage in each group seems to track with the predicted win percentage.
results_df['pred_group'] = pd.cut(results_df['prob_rep_win'], 10)
pred_groups = results_df.groupby("pred_group")
pred_prob = pred_groups['prob_rep_win'].aggregate(np.mean)
real_prob = pred_groups['Result'].aggregate(lambda x: np.mean(x > 0))
plt.scatter(pred_prob, real_prob)
plt.xlim(0,1)
plt.ylim(0,1)
plt.plot([0,1], [0,1], linewidth=1, color='black')
plt.title("Republican Win Probability")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.gca().set_aspect('equal')
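For a single-number summary of the probability forecasts (a side check, not part of the lab's pipeline), the Brier score, i.e. the mean squared error of the forecast probabilities against the 0/1 outcomes, combines calibration and sharpness; lower is better.
#Brier score of the Republican-win probabilities.
rep_won = (results_df['Result'] > 0).astype(float)
print("Brier score: %.3f" % np.mean((results_df['prob_rep_win'] - rep_won)**2))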
Adding some variation does seem to help in predicting win percentages. There are still some real issues for the close races near 50%; those are also the most interesting ones from a forecaster's perspective, so this is a problem. Hopefully, you can do better on the homework!