# special IPython command to prepare the notebook for matplotlib
%matplotlib inline
import numpy as np
import scipy
import pandas as pd # pandas
import matplotlib.pyplot as plt # module for plotting
from mpl_toolkits.mplot3d import Axes3D #3D plotting
import datetime as dt # module for manipulating dates and times
import requests
import scipy.stats as stats
import statsmodels.api as sm
from scipy.stats import binom
from __future__ import division
import re
from StringIO import StringIO
from zipfile import ZipFile
from pandas import read_csv
from urllib import urlopen
import urllib2
import json
import sklearn
import sklearn.preprocessing
import sklearn.datasets
#nice defaults for matplotlib
from matplotlib import rcParams
dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667),
(0.8509803921568627, 0.37254901960784315, 0.00784313725490196),
(0.4588235294117647, 0.4392156862745098, 0.7019607843137254),
(0.9058823529411765, 0.1607843137254902, 0.5411764705882353),
(0.4, 0.6509803921568628, 0.11764705882352941),
(0.9019607843137255, 0.6705882352941176, 0.00784313725490196),
(0.6509803921568628, 0.4627450980392157, 0.11372549019607843),
(0.4, 0.4, 0.4)]
rcParams['figure.figsize'] = (10, 6)
rcParams['figure.dpi'] = 150
rcParams['axes.color_cycle'] = dark2_colors
rcParams['lines.linewidth'] = 2
rcParams['axes.grid'] = True
rcParams['axes.facecolor'] = '#eeeeee'
rcParams['font.size'] = 14
rcParams['patch.edgecolor'] = 'none'
If we select random person and they test postive what is probability of positive test?
We write this as $\mbox{Prob}(D|+)?$
cystic fibrosis rate is 1 in 3,900, $\mbox{Prob}(D)=0.0025$
José Iglesias 2013
Month | At Bats | H | AVG |
---|---|---|---|
April | 20 | 9 | .450 |
What is your prediction for his average in October?
Note: No one has finished a season batting .400 since Ted Williams in 1941!
This is for all players (>500 AB) 2010, 2011, 2012
zip_folder = requests.get('http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip').content
zip_files = StringIO()
zip_files.write(zip_folder)
csv_files = ZipFile(zip_files)
players = csv_files.open('Batting.csv')
players = read_csv(players)
players.head()
playerID | yearID | stint | teamID | lgID | G | G_batting | AB | R | H | 2B | 3B | HR | RBI | SB | CS | BB | SO | IBB | HBP | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | aardsda01 | 2004 | 1 | SFN | NL | 11 | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
1 | aardsda01 | 2006 | 1 | CHN | NL | 45 | 43 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
2 | aardsda01 | 2007 | 1 | CHA | AL | 25 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
3 | aardsda01 | 2008 | 1 | BOS | AL | 47 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... |
4 | aardsda01 | 2009 | 1 | SEA | AL | 73 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... |
5 rows × 24 columns
dat = players[(players.yearID >= 2010) & (players.yearID <= 2012)]
dat['AVG'] = dat['H']/dat['AB']
dat = dat[dat['AB']>500]
df = dat[['yearID', 'AVG']].dropna()
rcParams['figure.figsize'] = (14, 4)
plt.figure(1)
for i in range(1,4):
plt.subplot(1,3,i)
plt.hist(df[df.yearID == (2009+i)].AVG.values*1000, bins = np.arange(200,370,20), normed = True)
plt.title(str(2009+i))
plt.xlabel("AVG")
if i==1:
plt.ylabel("Frequency")
plt.show()
Average is .275 and SD is 0.027
incremental:true
Should we trade him?
What is the SE of our estimate?
Confidence interval?
.450-.222 to .450+.222 = .228 to .672
Pick a random player, then what is their batting average
$$\begin{eqnarray*} \theta &\sim& N(\mu, \tau^2) \mbox{ is called a prior}\\ Y | \theta &\sim& N(\theta, \sigma^2) \mbox{ is called a sampling distribution} \end{eqnarray*}$$Two levels of variability:
Variables:
Here are the equations with our data
$$\begin{eqnarray*} \theta &\sim& N(.275, .027^2) \\ Y | \theta &\sim& N(\theta, .110^2) \end{eqnarray*}$$The continuous version of Bayes rule can be used here
$$ \begin{eqnarray*} f_{\theta|Y}(\theta|Y)&=&\frac{f_{Y|\theta}(Y|\theta) f_{\theta}(\theta)}{f_Y(Y)}\\ &=&\frac{f_{Y|\theta}(Y|\theta) f_{\theta}(\theta)}{\int_{\theta}f_{Y|\theta}(Y|\theta)f_{\theta}(\theta)}\\ \end{eqnarray*} $$We are particularly interested in the $\theta$ that maximizes $f_{\theta|Y}(\theta|Y)$.
In our case, these can be shown to be normal so we want the average $\mbox{E}(\theta|y)$
We can show the average of this distribution is the following:
$$ \begin{eqnarray*} \mbox{E}(\theta|y) &=& B \mu + (1-B) Y\\ &=& \mu + (1-B)(Y-\mu)\\ B &=& \frac{\sigma^2}{\sigma^2+\tau^2} \end{eqnarray*} $$In the case of José Iglesias, we have:
$$ \begin{eqnarray*} E(\theta | Y=.450) &=& B \times .275 + (1 - B) \times .450 \\ &=& .275 + (1 - B)(.450 - .260) \\ B &=&\frac{.110^2}{.110^2 + .027^2} = 0.943\\ E(\theta | Y=450) &\approx& .285\\ \end{eqnarray*} $$The variance can be shown to be:
$$ \begin{eqnarray*} \mbox{var}(\theta|y) &=& \frac{1}{1/\sigma^2+1/\tau^2} &=& \frac{1}{1/.110^2 + 1/.027^2} \end{eqnarray*} $$In our example the SD is 0.026
Month | At Bat | Hits | AVG |
---|---|---|---|
April | 20 | 9 | .450 |
May | 26 | 11 | .423 |
June | 86 | 34 | .395 |
July | 83 | 17 | .205 |
August | 85 | 25 | .294 |
September | 50 | 10 | .200 |
Total w/o April | 330 | 97 | .293 |
Frequentist confidence interval = .450 $\pm$ 0.220
Empirical Bayes credible interval = .285 $\pm$ 0.052
Actual = .293
The US Senate has 100 senators
Currently we have 55 Dems (includes 2 independents) versus 45 Republicans.
This election 36 Senate are being contested
21 are held by Democrats and 15 by Republicans.
stream = urllib2.urlopen('http://elections.huffingtonpost.com/pollster/api/charts/?topic=2014-senate')
data = json.load(stream)
states = [str(poll['state']) for poll in data]
### use google to figure out why 2 candidate for 3 states
## you will have to do this again each time database updated!
import collections
counter=collections.Counter(states)
twopollindex = [key for key in counter.keys() if counter[key]>1]
twopollindex
['NH', 'GA', 'OK', 'SC']
###NH: there are polls with Bass but he withdrew
for poll in data:
if str(poll['state']) in twopollindex:
print poll['title']
2014 New Hampshire Senate: Brown vs. Shaheen 2014 New Hampshire Senate: Bass vs. Shaheen 2014 Georgia Senate: Perdue vs. Nunn 2014 South Carolina Senate: Graham vs. Hutto 2014 South Carolina Senate: Scott vs. Dickerson 2014 Oklahoma Senate: Inhofe vs. Silverstein 2014 Oklahoma Senate: Lankford vs. Johnson 2014 Georgia Senate Runoff
## conclusiions after googling
## In NH Bass withdrew take out 6
## In Oklahoma and SC there are actually 2. Both special elections
data.pop(6)
{u'election_date': u'2014-11-04', u'estimates': [], u'last_updated': u'2013-10-28T17:33:36Z', u'poll_count': 3, u'short_title': u'NH Senate: Bass vs. Shaheen', u'slug': u'2014-new-hampshire-senate-bass-vs-shaheen', u'state': u'NH', u'title': u'2014 New Hampshire Senate: Bass vs. Shaheen', u'topic': u'2014-senate', u'url': u'http://elections.huffingtonpost.com/pollster/2014-new-hampshire-senate-bass-vs-shaheen'}
polls = [read_csv(urlopen(poll['url']+'.csv')) for poll in data]
states = [str(poll['state']) for poll in data]
#make a table of candidates
R = np.zeros(len(polls))
R.fill(np.nan)
D = np.zeros(len(polls))
D.fill(np.nan)
incumbent = np.zeros(len(polls))
incumbent.fill(np.nan)
d = {'states':states, 'R':R, 'D':D, 'incumbent':incumbent}
candidates = pd.DataFrame(data = d)
candidates = candidates[['states','R','D','incumbent']]
for i in range(len(data)):
x = data[i]['estimates'][0:2]
if not x[0]['last_name']:
tmp = data[i]['url'].split('-')+['vs']
j = tmp.index('vs')
if j!=len(tmp)-1:
candidates.R[i] = tmp[j-1].capitalize()
candidates.D[i] = tmp[j+1].capitalize()
candidates.incumbent[i] = np.nan
#if no data means race is decided
else:
tmp = [x[0]['party'],x[1]['party']]
candidates.R[i] = x[tmp.index('Rep')]['last_name']
idx = [k for k in range(len(tmp)) if tmp[k]!='Rep'][0]
candidates['D'][i] = x[idx]['last_name']
tmp2 = [x[0]['incumbent'],x[1]['incumbent']]
tmp2+=[True]
if tmp2.index(True)!=2:
candidates.incumbent[i] = tmp[tmp2.index(True)]
#remove second last name
candidates.R = [candidate.split(' ')[-1] for candidate in candidates.R]
candidates.head(5)
states | R | D | incumbent | |
---|---|---|---|---|
0 | KY | McConnell | Grimes | Rep |
1 | AR | Cotton | Pryor | Dem |
2 | MI | Land | Peters | NaN |
3 | LA | Cassidy | Landrieu | Dem |
4 | NH | Brown | Shaheen | Dem |
5 rows × 4 columns
diff = []
for i in range(len(polls)):
tmp = polls[i]
diff.append(tmp[candidates.R[i]].values-tmp[candidates.D[i]].values)
means = [np.mean(a) for a in diff]
ses = [np.std(a)/np.sqrt(len(a)) for a in diff]
ord_means, o = (list(x) for x in zip(*sorted(zip(means, range(len(means))))))
States = states[:]
for i in range(len(States)):
if States[i] in States[:i]:
States[i] +='2'
difference = [means[i] for i in o]
se = [ses[i] for i in o]
States = [States[i] for i in o]
d = {'se':se, 'states':States, 'difference':difference}
df = pd.DataFrame(data=d)
rcParams['figure.figsize'] = (12, 6)
plt.errorbar(range(len(States)), difference, yerr = 2*np.array(se), linestyle="None", zorder = 0)
plt.scatter(range(len(States)), difference, zorder = 1)
plt.hlines(0, -0.5, 35.5, alpha = 0.5, linewidth=1, zorder = 0)
plt.xlim(-0.5,35.5)
plt.ylabel('difference')
plt.xticks(range(len(States)), States, rotation = 50)
plt.show()
** Note: Kansas, New Hampshire and North Carolina **
ord_diff = [diff[i] for i in o]
plt.boxplot(ord_diff)
plt.xticks(np.arange(1,len(States)+1,1), States, rotation = 50)
plt.show()
def plot_pollsters(state):
idx = states.index(state)
pollsters = list(polls[idx]['Pollster'].values)
unique_polls = [pollsters[i] for i in range(len(pollsters)) if pollsters[i] not in pollsters[:i]]
xvals = [unique_polls.index(poll) for poll in pollsters]
plt.scatter(xvals,diff[idx], s=50)
plt.xticks(range(max(xvals)+1),unique_polls, rotation = 90)
plt.hlines(0, -0.1, max(xvals), alpha = 0.7, linewidth=1, zorder = 0, linestyles = '--')
plt.xlim(-0.2, max(xvals)+0.2)
plt.show()
plot_pollsters('KS')
plot_pollsters('NH')
plot_pollsters('NC')
i = states.index('NC')
pollster = polls[i]['Pollster'].values
day = [pd.to_datetime(x) for x in polls[i]['Start Date'].values]
dif = polls[i][candidates.R[i]] - polls[i][candidates.D[i]]
m = np.mean(dif)
s = np.std(dif)/np.sqrt(len(dif))
print [round(m-2*s,1),round(m+2*s,1)]
[-3.9, -1.8]
Using only recent polls:
'''
ind <- which(day>"2014-08-01")
m <- mean(dif[ind])
s <- sd(dif[ind])/sqrt(length(ind))
cat("[",round(m-2*s,1),round(m+2*s,1),"]")
'''
day0 = pd.to_datetime("2014-08-01")
ind = [i for i in range(len(day)) if day[i]>day0]
m = np.mean(dif[ind])
s = np.std(dif[ind])/np.sqrt(len(ind))
print [round(m-2*s,1),round(m+2*s,1)]
[-3.2, -1.2]
Use, previous North Carolina results, average 0 and SD 5
Two level model: $$\begin{eqnarray*} \theta &\sim& N(\mu, \tau^2) \\ Y | \theta &\sim& N(\theta, 0.5^2) \end{eqnarray*}$$
Observed $Y = -2.4$
How do we construct a prior? Are there other levels?