%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests
from pattern import web
import scipy.stats as stats
from scipy.stats import binom
from __future__ import division
import re
#nice defaults for matplotlib
from matplotlib import rcParams
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
(0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
(0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
(0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
(0.4, 0.6509803921568628, 0.11764705882352941),
(0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
(0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
(0.4, 0.4, 0.4)]
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = True
rcParams['axes.facecolor'] = '#eeeeee'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'
For a list of numbers $x_1,\dots,x_n$ the variance is defined as $$\sigma^2\equiv\frac{1}{n}\sum_{i=1}^n (x_i-\mu)^2$$
p = np.arange(0,1.01,0.01)
plt.plot(p, p*(1-p))
plt.xlabel("p")
plt.ylabel("p*(1-p)")
plt.ylim(0,0.26)
plt.show()
Let's see what happens for several Ns
B = 10000
p = 0.48
N = 5000
result = [np.mean([np.random.binomial(1,p) for i in range(N)])
for j in range(B)]
plt.hist(result, bins = 40)
plt.vlines(p, 0, 890, alpha=0.8, colors=dark2_colors[1])
plt.ylabel("Frequency")
plt.xlabel("Result")
plt.show()
$$\mbox{Pr} \left( \left| \frac{\bar{X} - p}{\sqrt{\mbox{var}(\bar{X})}} \right| \leq 2\right) = 0.95$$
B = 50
N = 100
p = 0.49
def get_prange():
res = [np.random.binomial(1,p) for i in range(N)]
avg = np.mean(res)
res = avg + np.sqrt(avg*(1-avg)/N)*np.array([-2,2])
res = list(res)
res = [(res[0]<=p and res[1]>=p)*1]+res
return res
cis = np.array([get_prange() for i in range(B)])
colors = [dark2_colors[1], dark2_colors[0]]
cols = [colors[int(i)] for i in cis[:,0]]
cis = cis[:,1:3]
fig, ax = plt.subplots(1, 1)
for i in range(len(cis)):
ax.hlines(i+1, cis[i][0], cis[i][1], color=cols[i])
ax.scatter(np.mean(cis[i,:]),i+1, zorder = 2)
ax.vlines(p, 0.8,B+0.2, alpha=0.5)
plt.ylim(0.8,B+0.2)
plt.xlabel("Probabilities")
plt.ylabel("Trial")
plt.show()
URL = "http://www.pollster.com/08USPresGEMvO-2.html"
html = requests.get(URL).text
dom = web.Element(html)
rows = dom.by_tag('tr')
table = []
for row in rows:
table_row = []
data = row.by_tag('td')
for value in data:
table_row.append(web.plaintext(value.content))
table.append(table_row)
df = pd.DataFrame(table[1:], columns = table[0])
df.head()
Pollster | Dates | N/Pop | McCain | Obama | Barr | Nader | Other | Undecided | Margin | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Marist College | 11/3/08 | 804 LV | 43 | 52 | - | - | 3 | 2 | +9D |
1 | GWU (Lake/Tarrance) | 11/2-3/08 | 400 LV | 44 | 49 | - | - | - | 7 | +5D |
2 | DailyKos.com (D)/Research 2000 | 11/1-3/08 | 1100 LV | 46 | 51 | 1 | 1 | 0 | 1 | +5D |
3 | IBD/TIPP | 11/1-3/08 | 981 LV | 44 | 52 | - | - | 4 | - | +8D |
4 | Rasmussen | 11/1-3/08 | 3000 LV | 46 | 52 | - | - | - | - | +6D |
#convert to number
Obama = np.array([float(a) for a in df["Obama"].values])
McCain = np.array([float(a) for a in df["McCain"].values])
df['diff']= (Obama - McCain)/100
#extract the year
year = [my_str.split("/")[-1] for my_str in df["Dates"].values]
year = [(a=='08')*1 for a in year]
full_year = ["2007", "2008"]
year = [full_year[i] for i in year]
#extract day
dates = df["Dates"].values
day = [my_str.split("-")[0] for my_str in dates]
day = [d[0]+"/"+d[1] for d in [a.split("/") for a in day]]
day = [day[i]+"/"+year[i] for i in range(len(day))]
day = [(pd.to_datetime(a)-pd.to_datetime("11/4/2008")).days for a in day]
df["day"] = day
df["week"] = np.array([round(d/7,0) for d in day])
#Sort by polster
df = df.sort("Pollster")
#Look at the last month
new_df=df[df["day"]>-30]
new_df.head()
Pollster | Dates | N/Pop | McCain | Obama | Barr | Nader | Other | Undecided | Margin | diff | day | week | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
60 | ABC/Post | 10/18-21/08 | 1330 LV | 43 | 54 | - | - | 1 | 1 | +11D | 0.11 | -17 | -2 |
46 | ABC/Post | 10/22-25/08 | 1308 LV | 45 | 52 | - | - | 1 | 2 | +7D | 0.07 | -13 | -2 |
29 | ABC/Post | 10/26-29/08 | 1327 LV | 44 | 52 | - | - | 2 | 2 | +8D | 0.08 | -9 | -1 |
15 | ABC/Post | 10/30-11/2/08 | 2470 LV | 44 | 53 | - | - | 2 | 1 | +9D | 0.09 | -5 | -1 |
98 | ABC/Post | 10/8-11/08 | 766 LV | 43 | 53 | - | - | 2 | 2 | +10D | 0.10 | -27 | -4 |
def check_NA(n):
n = re.findall(r'[+-]?\d+.\d+',n)
if len(n)==0:
return np.nan
else:
return float(n[0])
N = np.array([check_NA(n) for n in new_df["N/Pop"].values])
leftover = [re.sub(r"-", "0", s) for s in new_df["Undecided"].values]
leftover = np.array([int(s) for s in leftover])
Obama = np.array([float(a) for a in new_df["Obama"].values])
avgs = Obama/100+leftover/100/2
se = np.sqrt(avgs*(1-avgs)/N)
se = np.nan_to_num(se)
x = list(set(new_df["Pollster"]))
x.sort()
idx = new_df.groupby(['Pollster']).count()["diff"].values
idx = np.array([0]+list(np.cumsum(idx))[:-1])
cols = {}
i = 0
for poll in x:
cols[poll]=dark2_colors[i]
i+=1
if i>len(dark2_colors)-1:
i=0
fig, ax = plt.subplots(1, 1)
for i in range(len(avgs)):
ax.vlines(i, avgs[i]-2*se[i], avgs[i]+2*se[i],
color = cols[new_df["Pollster"].values[i]])
ax.scatter(i, avgs[i], zorder = 2)
ax.hlines(.529, 0, len(avgs), alpha=0.4, zorder = 1)
plt.ylim(np.min(avgs-2*se)-0.01, np.max(avgs+2*se)+0.01)
plt.xlim(-1,len(avgs))
plt.xticks(idx,x, rotation = 90)
plt.show()
Assume poll results are observation from random variables
plt.hist(avgs, bins = np.arange(0.46,0.59, 0.01))
plt.xlabel("avgs")
plt.ylabel("Frequency")
plt.show()
z = (avgs-np.mean(avgs))/np.std(avgs)
stats.probplot(z, dist="norm", plot=plt)
plt.title("Normal Q-Q plot")
plt.show()
print "Mean:",np.mean(avgs)
print "Standard deviation:", np.std(avgs)
print "Variance^(1/2):", np.std(avgs)/np.sqrt(len(avgs))
a = np.mean(avgs)+np.array([-2,2])*np.std(avgs)/np.sqrt(len(avgs))
print "95% Confidence Interval:",a
Mean: 0.525571428571 Standard deviation: 0.0162174844757 Variance^(1/2): 0.00158266442829 95% Confidence Interval: [ 0.5224061 0.52873676]
keep = [i for i in range(len(avgs)) if avgs[i]>0.48]
df2 = new_df.iloc[keep]
idx = df2.groupby(['Pollster']).count()["diff"].values
idx = np.array([0]+list(np.cumsum(idx))[:-1])
fig, ax = plt.subplots(1, 1)
for i in range(len(avgs)):
if i in keep:
ax.vlines(i, avgs[i]-2*se[i], avgs[i]+2*se[i],
color = cols[new_df["Pollster"].values[i]])
ax.scatter(i, avgs[i], zorder = 2)
ax.hlines(.529, 0, len(avgs), alpha=0.4, zorder = 1)
plt.ylim(np.min(avgs-2*se)-0.01, np.max(avgs+2*se)+0.01)
plt.xlim(-1,len(avgs))
plt.xticks(idx,x, rotation = 90)
plt.show()
Let's look at last 100 days:
new_df = df[df["day"]>-100]
leftover = [re.sub(r"-", "0", s) for s in new_df["Undecided"].values]
leftover = np.array([int(s) for s in leftover])
Obama = np.array([float(a) for a in new_df["Obama"].values])
avgs = Obama/100+leftover/100/2
plt.scatter(new_df["day"],avgs)
plt.xlim(-100,0)
plt.xlabel("Day")
plt.ylabel("Avgs")
plt.show()
new_df["avg"] = avgs
new_df.boxplot(column = "avg", by = "week")
plt.show()
from statsmodels.formula.api import ols
new_df = new_df.copy()
new_df['week_factor'] = [str(int(w)) for w in new_df['week'].values]
lm1 = ols('diff ~ week_factor', new_df).fit()
new_df['week_resid'] = lm1.resid
new_df['pollster_mean_resid'] = new_df.groupby("Pollster")["week_resid"].transform(lambda x: x.mean())
new_df.sort("pollster_mean_resid", ascending=1, inplace=True)
new_df.boxplot(column="week_resid", by="pollster_mean_resid", rot=90)
pollster_labels = new_df.groupby("pollster_mean_resid")["Pollster"].agg(lambda x: x.iloc[0])
plt.xticks(np.arange(len(pollster_labels))+1, pollster_labels.values)
plt.show()
lm2 = ols('diff ~ Pollster', new_df).fit()
new_df['pollster_resid'] = lm2.resid
new_df['week_mean_resid'] = new_df.groupby("week")["pollster_resid"].transform(lambda x: x.mean())
new_df.sort("week_mean_resid", ascending=1, inplace=True)
new_df.boxplot(column="pollster_resid", by="week", rot=90)
plt.show()
lm3 = ols('diff ~ week_factor + Pollster', new_df).fit()
from statsmodels.stats.anova import anova_lm
print anova_lm(lm3)
df sum_sq mean_sq F PR(>F) week_factor 14 0.197498 0.014107 21.787072 3.323912e-34 Pollster 38 0.071579 0.001884 2.909165 5.379619e-07 Residual 220 0.142448 0.000647 NaN NaN
LOWESS (LOcally Weighted Scatterplot Smoothing, aka LOESS) works by fitting a line to windows around a point, then stitches these local fits together.
plt.scatter(new_df["day"],new_df["Obama"], color='blue', marker='o', facecolors='none')
plt.scatter(new_df["day"],new_df["McCain"], color='red', marker='v', facecolors='none')
plt.xlim(-100,0)
plt.xlabel("Day")
plt.ylabel("Avgs")
from statsmodels.nonparametric.api import lowess
#Smooth using 1/10 of the total data surrounding a point for the local fit.
smoothed = lowess(new_df['Obama'], new_df['day'], frac=0.1)
plt.plot(smoothed[:,0], smoothed[:,1], color='blue')
smoothed = lowess(new_df['McCain'], new_df['day'], frac=0.1)
plt.plot(smoothed[:,0], smoothed[:,1], color="red")
plt.show()
#plt.plot(lowess(new_df['diff'], new_df['week']))