Variance and variability

Date: September 30

Copyright (c) 2014 Rafael Irizarry MIT License

In [1]:
%matplotlib inline 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests
from pattern import web
import scipy.stats as stats
from scipy.stats import binom
from __future__ import division
import re

#nice defaults for matplotlib
from matplotlib import rcParams

# ColorBrewer "Dark2" qualitative palette, as (r, g, b) fractions in [0, 1].
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
                (0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
                (0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
                (0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
                (0.4, 0.6509803921568628, 0.11764705882352941),
                (0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
                (0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
                (0.4, 0.4, 0.4)]

rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
# 'axes.color_cycle' was removed in matplotlib 2.0; fall back to the modern
# 'axes.prop_cycle' key when the legacy one is gone so the notebook still
# runs on current matplotlib.
if 'axes.color_cycle' in rcParams:
    rcParams['axes.color_cycle'] = dark2_colors
else:
    from cycler import cycler  # ships with matplotlib >= 1.5
    rcParams['axes.prop_cycle'] = cycler('color', dark2_colors)
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = True
rcParams['axes.facecolor'] = '#eeeeee'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'

Variance and standard deviation

For a list of numbers $x_1,\dots,x_n$ the variance is defined as $$\sigma^2\equiv\frac{1}{n}\sum_{i=1}^n (x_i-\mu)^2$$

  • $\mu$ is the average of $X$
  • The standard deviation (SD) is the square root: $\sigma$
  • Note: the SD has the same units of $x$
  • If $x$ is approximately normal then $\mu$ and $\sigma$ are a sufficient summary

Random Variables

  • Suppose now $X$ is a random variable
  • Specifically, a binary outcome: $\mbox{Prob}(X=1)=p$
  • What is the variance of $X$?
  • $\mu = p\times 1 + (1-p)\times 0 = p$
  • $\sigma^2 = (1-p)\times(0-p)^2 + p\times(1-p)^2= p\times(1-p)$
In [2]:
# Variance of a Bernoulli(p) variable, p*(1-p), over a grid of p values.
# The curve peaks at p = 0.5 (variance 0.25).
p = np.arange(0, 1.01, 0.01)
bernoulli_var = p * (1 - p)
plt.plot(p, bernoulli_var)
plt.ylim(0, 0.26)
plt.xlabel("p")
plt.ylabel("p*(1-p)")
plt.show()

Average of Random Variables

  • Now we take a poll
  • We observe $N$ independent outcomes of this random variable.
  • Take the average $\bar{X}$ of these $N$ random variables.
  • In the poll context, what is $\bar{X}$ and what is $p$?
  • Which is random?

Mean of sample average

  • We use $\mbox{E}(X)$ as shorthand
  • We already saw that $\mbox{E}(X_i)=p$ for any $i$
  • Rules for Mean:
    • Mean of sum is sum of means
    • Scale changes are preserved

$$\mbox{E}(\bar{X})=\frac{1}{N}\sum_{i=1}^N p = p$$

Variance of sample average

  • We use $\mbox{var}(X)$ as shorthand
  • We already saw that $\mbox{var}(X_i)=p(1-p)$ for any $i$
  • Rule for variance:
    • if independent, the variance of a sum is the sum of the variances
    • scales are squared $$\mbox{var}(\bar{X})=\frac{1}{N^2} \sum_{i=1}^N p(1-p) = \frac{p(1-p)}{N}$$

Law of large numbers

  • $\bar{X}$ has mean $p$ and standard error $\sqrt{\frac{p(1-p)}{N}}$
  • This means that $\bar{X}$ converges to $p$
  • What is the implication for polls?
  • What is the standard error of a poll with sample size $1,000$?

Monte Carlo Simulation

Let's see what happens for several Ns

In [3]:
B = 10000  # number of simulated polls
p = 0.48   # true underlying support
N = 5000   # respondents per poll
# The mean of N independent Bernoulli(p) draws has exactly the distribution
# Binomial(N, p) / N, so draw the B poll averages directly instead of the
# original B*N (50 million) Python-level np.random.binomial(1, p) calls.
result = list(np.random.binomial(N, p, size=B) / float(N))
In [4]:
# Distribution of the B simulated poll averages, with the true p marked.
plt.hist(result, bins=40)
plt.vlines(p, 0, 890, alpha=0.8, colors=dark2_colors[1])
plt.xlabel("Result")
plt.ylabel("Frequency")
plt.show()

Central limit theorem

  • Tells us $\bar{X}$ is normal.
  • We can give confidence intervals based on $$\mbox{Pr} \left( \left| \frac{\bar{X} - p}{\sqrt{\mbox{var}(\bar{X})}} \right| \leq 2\right) = 0.95$$
In [5]:
B = 50
N = 100
p = 0.49
def get_prange():
    """Simulate one poll of N Bernoulli(p) draws and build a 95% CI.

    Returns a 3-element list: [covers, lo, hi], where `covers` is 1 when
    the interval [lo, hi] = phat +/- 2*sqrt(phat*(1-phat)/N) contains the
    true p, and 0 otherwise. Uses the module-level globals p and N.
    """
    draws = [np.random.binomial(1, p) for _ in range(N)]
    phat = np.mean(draws)
    half_width = 2 * np.sqrt(phat * (1 - phat) / N)
    lo = phat - half_width
    hi = phat + half_width
    covers = 1 if (lo <= p and hi >= p) else 0
    return [covers, lo, hi]
In [6]:
# Simulate B confidence intervals. First column flags whether the interval
# covered the true p; the remaining two columns are the endpoints.
cis = np.array([get_prange() for _ in range(B)])
# Green (dark2_colors[0]) when the interval covers p, orange otherwise.
colors = [dark2_colors[1], dark2_colors[0]]
cols = [colors[int(flag)] for flag in cis[:, 0]]
cis = cis[:, 1:3]
In [7]:
# One horizontal segment per simulated interval; the vertical line marks
# the true p. Roughly 95% of the segments should cross it.
fig, ax = plt.subplots(1, 1)
for trial, (lo, hi) in enumerate(cis, start=1):
    ax.hlines(trial, lo, hi, color=cols[trial - 1])
    ax.scatter((lo + hi) / 2.0, trial, zorder=2)
ax.vlines(p, 0.8, B + 0.2, alpha=0.5)
plt.ylim(0.8, B + 0.2)
plt.xlabel("Probabilities")
plt.ylabel("Trial")
plt.show()

Poll data

In [8]:
# Fetch the 2008 general-election poll table and flatten it into a list of
# rows (lists of cell strings). Header rows without <td> children come
# through as empty lists, matching the original behavior.
URL = "http://www.pollster.com/08USPresGEMvO-2.html"
html = requests.get(URL).text
dom = web.Element(html)
rows = dom.by_tag('tr')

table = [[web.plaintext(cell.content) for cell in row.by_tag('td')]
         for row in rows]
In [9]:
# Build a DataFrame from the scraped rows; the first scraped row holds
# the column headers (Pollster, Dates, N/Pop, ...).
df = pd.DataFrame(table[1:], columns = table[0])
df.head()
Out[9]:
Pollster Dates N/Pop McCain Obama Barr Nader Other Undecided Margin
0 Marist College 11/3/08 804 LV 43 52 - - 3 2 +9D
1 GWU (Lake/Tarrance) 11/2-3/08 400 LV 44 49 - - - 7 +5D
2 DailyKos.com (D)/Research 2000 11/1-3/08 1100 LV 46 51 1 1 0 1 +5D
3 IBD/TIPP 11/1-3/08 981 LV 44 52 - - 4 - +8D
4 Rasmussen 11/1-3/08 3000 LV 46 52 - - - - +6D
In [10]:
#convert to number
Obama = np.array([float(a) for a in df["Obama"].values])
McCain = np.array([float(a) for a in df["McCain"].values])
df['diff']= (Obama - McCain)/100

#extract the year
year = [my_str.split("/")[-1] for my_str in df["Dates"].values]
year = [(a=='08')*1 for a in year]
full_year = ["2007", "2008"]
year = [full_year[i] for i in year]

#extract day
dates = df["Dates"].values
day = [my_str.split("-")[0] for my_str in dates]
day = [d[0]+"/"+d[1] for d in [a.split("/") for a in day]]
day = [day[i]+"/"+year[i] for i in range(len(day))]
day = [(pd.to_datetime(a)-pd.to_datetime("11/4/2008")).days for a in day]
df["day"] = day
df["week"] = np.array([round(d/7,0) for d in day])

#Sort by polster
df = df.sort("Pollster")

#Look at the last month
new_df=df[df["day"]>-30]
In [11]:
new_df.head()
Out[11]:
Pollster Dates N/Pop McCain Obama Barr Nader Other Undecided Margin diff day week
60 ABC/Post 10/18-21/08 1330 LV 43 54 - - 1 1 +11D 0.11 -17 -2
46 ABC/Post 10/22-25/08 1308 LV 45 52 - - 1 2 +7D 0.07 -13 -2
29 ABC/Post 10/26-29/08 1327 LV 44 52 - - 2 2 +8D 0.08 -9 -1
15 ABC/Post 10/30-11/2/08 2470 LV 44 53 - - 2 1 +9D 0.09 -5 -1
98 ABC/Post 10/8-11/08 766 LV 43 53 - - 2 2 +10D 0.10 -27 -4
In [12]:
def check_NA(n):
    """Extract the leading numeric sample size from an 'N/Pop' string.

    Parameters
    ----------
    n : str
        Raw cell text such as "1330 LV", "804 LV", or "-" (missing).

    Returns
    -------
    float
        The first number found in the string, or NaN if none.
    """
    # The original pattern r'[+-]?\d+.\d+' had an unescaped '.', which
    # matched ANY character between two digit runs and therefore required
    # at least three characters — a two-digit sample size like "96 LV"
    # came back as NaN. Match an integer with an optional decimal part.
    matches = re.findall(r'[+-]?\d+(?:\.\d+)?', n)
    if len(matches) == 0:
        return np.nan
    else:
        return float(matches[0])
# Parse the sample size out of each "N/Pop" cell (NaN when missing).
N = np.array([check_NA(n) for n in new_df["N/Pop"].values])
In [13]:
# Split the undecided share evenly between the two candidates: treat "-"
# as 0, then add half of the undecided percentage to Obama's share.
leftover = np.array([int(re.sub(r"-", "0", s))
                     for s in new_df["Undecided"].values])
Obama = np.array([float(v) for v in new_df["Obama"].values])
avgs = Obama / 100 + leftover / 100 / 2

# Standard error of each poll's estimate, sqrt(p(1-p)/N); polls with an
# unknown sample size (NaN) get se = 0.
se = np.nan_to_num(np.sqrt(avgs * (1 - avgs) / N))
In [14]:
# One color per pollster, cycling through the palette with a modulo
# instead of the original manual wrap-around counter.
x = list(set(new_df["Pollster"]))
x.sort()
# x-tick positions: the first row index of each pollster's run of polls
# (the frame is sorted by pollster, so cumulative counts give the starts).
idx = new_df.groupby(['Pollster']).count()["diff"].values
idx = np.array([0]+list(np.cumsum(idx))[:-1])
cols = {poll: dark2_colors[i % len(dark2_colors)]
        for i, poll in enumerate(x)}

# Each poll as a point with a +/- 2 SE bar; the horizontal line is the
# actual election outcome (Obama 52.9%).
fig, ax = plt.subplots(1, 1)
for i in range(len(avgs)):
    ax.vlines(i, avgs[i]-2*se[i], avgs[i]+2*se[i], 
              color = cols[new_df["Pollster"].values[i]])
    ax.scatter(i, avgs[i], zorder = 2)
ax.hlines(.529, 0, len(avgs), alpha=0.4, zorder = 1)
plt.ylim(np.min(avgs-2*se)-0.01, np.max(avgs+2*se)+0.01)
plt.xlim(-1,len(avgs))
plt.xticks(idx,x, rotation = 90)
plt.show()

Modelling Polls

Assume poll results are observations of random variables

  • Let $X$ be outcome of 105 polls.
  • What is the distribution of $X$?
In [15]:
# Distribution of the adjusted poll averages in 1-point-wide bins.
bin_edges = np.arange(0.46, 0.59, 0.01)
plt.hist(avgs, bins=bin_edges)
plt.ylabel("Frequency")
plt.xlabel("avgs")
plt.show()
In [16]:
# Standardize the poll averages and compare their quantiles against the
# standard normal — points near the line support the normality assumption.
z = (avgs - avgs.mean()) / avgs.std()
stats.probplot(z, dist="norm", plot=plt)
plt.title("Normal Q-Q plot")
plt.show()

Average of averages

  • What is variance of $X$?
  • We can get an idea from the sample: $$\hat{\sigma}^2=\frac{1}{N-1}\sum_{i=1}^N (X_i-\bar{X})^2$$
  • With $\bar{X}$ the average
  • What is the new model $\bar{X}$?

Using CLT

  • $\bar{X}$ is Normal with mean
  • $\mbox{E}(\bar{X})=p$
  • $\mbox{var}({\bar{X}})= \hat{\sigma}^2/N$
In [17]:
# Summary statistics for the poll averages (Python 2 print statements —
# this notebook predates Python 3).
print "Mean:",np.mean(avgs)
print "Standard deviation:", np.std(avgs)
# NOTE(review): despite the label, this prints the standard error of the
# mean, sd/sqrt(n) — i.e. sqrt(var(X-bar)) — not a generic "Variance^(1/2)".
print "Variance^(1/2):", np.std(avgs)/np.sqrt(len(avgs))
# 95% CI for the mean of the averages: x-bar +/- 2 standard errors.
a = np.mean(avgs)+np.array([-2,2])*np.std(avgs)/np.sqrt(len(avgs))
print "95% Confidence Interval:",a
Mean: 0.525571428571
Standard deviation: 0.0162174844757
Variance^(1/2): 0.00158266442829
95% Confidence Interval: [ 0.5224061   0.52873676]

Is the model correct?

In [18]:
# Indices of polls whose adjusted Obama share exceeds 48%.
keep = [i for i, avg in enumerate(avgs) if avg > 0.48]
In [19]:
# Re-draw the per-poll CI plot restricted to the kept polls.
df2 = new_df.iloc[keep]
# Recompute the x-tick positions for the filtered frame.
idx = df2.groupby(['Pollster']).count()["diff"].values
idx = np.array([0]+list(np.cumsum(idx))[:-1])
fig, ax = plt.subplots(1, 1)
# Iterate directly over the kept indices instead of the original
# `for i in range(len(avgs)): if i in keep`, which performed a linear
# membership scan for every row. `keep` is ascending, so the drawing
# order is unchanged.
for i in keep:
    ax.vlines(i, avgs[i]-2*se[i], avgs[i]+2*se[i], 
              color = cols[new_df["Pollster"].values[i]])
    ax.scatter(i, avgs[i], zorder = 2)
ax.hlines(.529, 0, len(avgs), alpha=0.4, zorder = 1)
plt.ylim(np.min(avgs-2*se)-0.01, np.max(avgs+2*se)+0.01)
plt.xlim(-1,len(avgs))
plt.xticks(idx,x, rotation = 90)
plt.show()

Time effect

Let's look at last 100 days:

In [20]:
# Widen the window to the last 100 days and rebuild the adjusted averages
# (same undecided-splitting adjustment as before).
new_df = df[df["day"] > -100]
leftover = np.array([int(re.sub(r"-", "0", s))
                     for s in new_df["Undecided"].values])
Obama = np.array([float(v) for v in new_df["Obama"].values])
avgs = Obama / 100 + leftover / 100 / 2
In [21]:
# Adjusted Obama share versus days before the election.
plt.scatter(new_df["day"], avgs)
plt.xlabel("Day")
plt.ylabel("Avgs")
plt.xlim(-100, 0)
plt.show()

By week stratification

In [22]:
# `new_df` is a slice of `df` (new_df = df[df["day"] > -100]), so adding a
# column to it directly triggers pandas' SettingWithCopyWarning and may not
# stick. Work on an explicit copy instead; downstream cells only read
# `new_df`, so this is fully compatible.
new_df = new_df.copy()
new_df["avg"] = avgs
new_df.boxplot(column = "avg", by = "week")
plt.show()

Analysis of variance

In [23]:
from statsmodels.formula.api import ols
# Copy before adding columns so we don't mutate a slice of df.
new_df = new_df.copy()
# Treat week as a categorical factor (string), not a numeric trend.
new_df['week_factor'] = [str(int(w)) for w in new_df['week'].values]
# Regress the margin on week; the residuals are the margin with the
# week-to-week movement removed.
lm1 = ols('diff ~ week_factor', new_df).fit()
new_df['week_resid'] = lm1.resid
# Each pollster's mean residual = its "house effect" after de-trending.
new_df['pollster_mean_resid'] = new_df.groupby("Pollster")["week_resid"].transform(lambda x: x.mean())
# NOTE(review): DataFrame.sort was removed in pandas 0.20; modern pandas
# needs sort_values. Sorting orders the boxplot by house effect.
new_df.sort("pollster_mean_resid", ascending=1, inplace=True)
new_df.boxplot(column="week_resid", by="pollster_mean_resid", rot=90)
# Relabel the x-axis with pollster names (one per mean-residual group).
pollster_labels = new_df.groupby("pollster_mean_resid")["Pollster"].agg(lambda x: x.iloc[0])
plt.xticks(np.arange(len(pollster_labels))+1, pollster_labels.values)
plt.show()
In [24]:
# Flip the conditioning: remove the pollster (house) effect first, then
# look at what varies by week in the residuals.
lm2 = ols('diff ~ Pollster', new_df).fit()
new_df['pollster_resid'] = lm2.resid
# Mean residual per week, used only to order/group the boxplot.
new_df['week_mean_resid'] = new_df.groupby("week")["pollster_resid"].transform(lambda x: x.mean())
# NOTE(review): DataFrame.sort was removed in pandas 0.20 (sort_values).
new_df.sort("week_mean_resid", ascending=1, inplace=True)
new_df.boxplot(column="pollster_resid", by="week", rot=90)

plt.show()
In [25]:
# Two-way ANOVA: fit the margin on both factors at once and test each.
# The output below shows both the week effect and the pollster "house
# effect" are highly significant.
lm3 = ols('diff ~ week_factor + Pollster', new_df).fit()
from statsmodels.stats.anova import anova_lm

# Python 2 print statement (this notebook predates Python 3).
print anova_lm(lm3)
              df    sum_sq   mean_sq          F        PR(>F)
week_factor   14  0.197498  0.014107  21.787072  3.323912e-34
Pollster      38  0.071579  0.001884   2.909165  5.379619e-07
Residual     220  0.142448  0.000647        NaN           NaN

Smoothing

LOWESS (LOcally Weighted Scatterplot Smoothing, aka LOESS) works by fitting a line to windows around a point, then stitches these local fits together.

In [26]:
plt.scatter(new_df["day"],new_df["Obama"], color='blue', marker='o', facecolors='none')
plt.scatter(new_df["day"],new_df["McCain"], color='red', marker='v', facecolors='none')
plt.xlim(-100,0)
plt.xlabel("Day")
plt.ylabel("Avgs")

from statsmodels.nonparametric.api import lowess
#Smooth using 1/10 of the total data surrounding a point for the local fit.
smoothed = lowess(new_df['Obama'], new_df['day'], frac=0.1)
plt.plot(smoothed[:,0], smoothed[:,1], color='blue')

smoothed = lowess(new_df['McCain'], new_df['day'], frac=0.1)
plt.plot(smoothed[:,0], smoothed[:,1], color="red")
plt.show()
#plt.plot(lowess(new_df['diff'], new_df['week']))