# Notebook setup: pandas/numpy for data wrangling, matplotlib for plots,
# statsmodels for the GLM fits further below.
import pandas as pd, numpy as np, matplotlib.pyplot as plt, math, statsmodels.api as sm
%matplotlib inline
plt.style.use('ggplot')
# Project-local scraper/loader for fantasy-football data, keyed by position.
from ffball_getData import ffball_data
QB_data = ffball_data('QB')
# One-time scrape of weeks 2-10 followed by a save to disk; kept commented
# out so re-running the notebook does not re-hit the data sources.
#for weeks in range(2,11):
# QB_data.update(weeks)
#QB_data.save_df()
TODO: recode the bye week; one-hot encode home/away; add interaction effects; drop the "no opponent" level; encode teams for prediction by looping over the team names; try a Gamma link; add Yahoo data?
#axs = pd.tools.plotting.scatter_matrix(QB_data.ix[:,-8:], figsize=(6,6), diagonal='kde');
#[ax.set_yticks([]) for ax in axs[:,0]] #turn off the ticks that take up way too much space in such a crammed figure
#[ax.set_xticks([]) for ax in axs[-1,:]];
So I want past performance to influence current predictions and both of them to influence the predicted variable. Is this possible? I am not sure about the time thing. It seems like it should be part of the model, but I am uncertain about how to include it. I should also probably have the matchup in the model. There is nothing about matchup in this model and that would be helpful. How to get that data and how exactly to give it to the model is a mystery...
Ok, then there is the matter of making player-specific predictions, but hopefully every prediction is more or less equally good for each player — so while the predictions are player-specific, the error is shared across the players.
I am having trouble seeing the big picture here, so I will just start with a stupid linear model and go from there.
$$Actual = a_{[i]j} \cdot x + \eta$$

import statsmodels.api as sm
import math

# Load the scraped QB data and one-hot encode the categorical columns.
QB_df = QB_data.load_df()

# Opponent: one-hot encode and drop the 0 level (presumably "no opponent" /
# bye — see the TODO above), which becomes the reference category.
all_teams = QB_df.pop('opp')
team_enc = pd.get_dummies(all_teams)
team_enc.pop(0)

# Home/away indicator expands to three dummies: bye/injured, away, home.
home_away = pd.get_dummies(QB_df.pop('home_away'))
home_away.columns = ['bye_inj', 'away', 'home']
QB_df = pd.concat([QB_df, team_enc, home_away], axis=1)
# Flip bye_inj so 1 means "played this week" — used below to zero out
# site predictions on bye/injury weeks.
QB_df['bye_inj'] = np.double(QB_df['bye_inj'] == 0)
#QB_df = QB_df[QB_df['player_num']<35]

# Response: actual CBS fantasy points. Predictors: every column from the
# 6th onward. .iloc replaces the deprecated (and later removed) .ix indexer.
data_endo = QB_df.pop('cbs_actual2')
data_exo = QB_df.iloc[:, 5:].values
# Interact the first 17 exogenous columns (the per-site predictions/stats —
# TODO confirm the count) with bye_inj, the 3rd column from the end.
data_exo[:, :17] = data_exo[:, :17] * np.tile(data_exo[:, -3], (17, 1)).T
data_exo = sm.add_constant(data_exo, prepend=False)

# Plain Gaussian GLM with identity link for now; a Gamma family is a TODO.
# (Renamed from glm_gamma: the family here is Gaussian, not Gamma.)
glm_model = sm.GLM(data_endo, data_exo, family=sm.families.Gaussian())
glm_results = glm_model.fit()
print(glm_results.summary())
# Compare each prediction site (and our fitted GLM) against the actual
# points via mean squared error, with standard errors of the MSE.
MSE, STE = [], []
predicting_sites = ['FPTS', 'fsharks', 'fftoday', 'nfl', 'espn', 'yahoo', 'cbs_predict']
for predictors in predicting_sites:
    # Vectorized squared errors for this site's predictions.
    SE = (QB_df[predictors].values - data_endo.values) ** 2
    MSE.append(SE.mean())
    STE.append(SE.std() / math.sqrt(len(SE)))

# Our model: predict all rows in one call instead of a per-row loop.
SE = (glm_results.predict(data_exo) - data_endo.values) ** 2
MSE.append(SE.mean())
STE.append(SE.std() / math.sqrt(len(SE)))

# 8 points: the 7 sites plus the fitted model ('model' tick at the end).
plt.errorbar(np.arange(0, 8), MSE, yerr=STE)
plt.xlim(-1, 8)
plt.ylabel('MSE')
plt.xticks(np.arange(-1, 9), [''] + predicting_sites + ['model'] + [''])
Generalized Linear Model Regression Results ============================================================================== Dep. Variable: cbs_actual2 No. Observations: 747 Model: GLM Df Residuals: 699 Model Family: Gaussian Df Model: 47 Link Function: identity Scale: 19.723653825 Method: IRLS Log-Likelihood: -2148.9 Date: Sun, 20 Nov 2016 Deviance: 13787. Time: 16:49:39 Pearson chi2: 1.38e+04 No. Iterations: 4 ============================================================================== coef std err z P>|z| [95.0% Conf. Int.] ------------------------------------------------------------------------------ x1 -0.0586 0.092 -0.640 0.522 -0.238 0.121 x2 0.1899 0.149 1.272 0.203 -0.103 0.483 x3 0.0197 0.024 0.824 0.410 -0.027 0.067 x4 5.0189 3.375 1.487 0.137 -1.596 11.634 x5 -3.0335 1.259 -2.409 0.016 -5.501 -0.566 x6 0.0388 0.021 1.879 0.060 -0.002 0.079 x7 0.4611 0.266 1.735 0.083 -0.060 0.982 x8 0.0613 0.076 0.810 0.418 -0.087 0.210 x9 0.1607 0.150 1.069 0.285 -0.134 0.455 x10 8.9759 3.378 2.657 0.008 2.354 15.597 x11 1.9341 1.303 1.484 0.138 -0.620 4.488 x12 1.9341 1.303 1.484 0.138 -0.620 4.488 x13 0.0508 0.148 0.344 0.731 -0.238 0.340 x14 0.0508 0.148 0.344 0.731 -0.238 0.340 const -2.214e-12 7.83e-13 -2.827 0.005 -3.75e-12 -6.79e-13 x15 -2.2730 1.405 -1.618 0.106 -5.026 0.480 x16 -0.9640 0.550 -1.754 0.079 -2.041 0.113 x17 0.1746 0.095 1.836 0.066 -0.012 0.361 x18 -0.9428 0.664 -1.420 0.156 -2.244 0.359 x19 0.1768 0.041 4.288 0.000 0.096 0.258 x20 -0.1590 0.114 -1.399 0.162 -0.382 0.064 x21 0.2992 0.223 1.339 0.181 -0.139 0.737 x22 1.4151 0.656 2.159 0.031 0.130 2.700 x23 -1.6888 1.844 -0.916 0.360 -5.302 1.925 x24 -6.4333 2.038 -3.156 0.002 -10.428 -2.438 x25 0.7689 1.234 0.623 0.533 -1.650 3.187 x26 1.1333 1.269 0.893 0.372 -1.353 3.620 x27 -3.7517 1.650 -2.273 0.023 -6.986 -0.517 x28 -2.8892 1.857 -1.556 0.120 -6.529 0.751 x29 -0.5590 1.278 -0.437 0.662 -3.063 1.945 x30 5.0072 1.713 2.923 0.003 1.650 8.365 x31 4.0166 1.837 2.187 0.029 0.417 7.616 x32 -0.2773 
1.178 -0.235 0.814 -2.585 2.031 x33 1.9189 1.922 0.999 0.318 -1.847 5.685 x34 -0.1508 1.197 -0.126 0.900 -2.496 2.195 x35 0.0854 1.749 0.049 0.961 -3.342 3.513 x36 4.4033 1.782 2.471 0.013 0.911 7.896 x37 5.5240 1.616 3.419 0.001 2.357 8.691 x38 -0.3842 2.097 -0.183 0.855 -4.494 3.725 x39 -1.9738 1.662 -1.188 0.235 -5.230 1.283 x40 -3.1657 2.247 -1.409 0.159 -7.569 1.237 x41 -3.9229 1.867 -2.101 0.036 -7.583 -0.263 x42 -0.6978 1.817 -0.384 0.701 -4.260 2.864 x43 0.6138 1.894 0.324 0.746 -3.098 4.326 x44 0.0339 1.672 0.020 0.984 -3.243 3.311 x45 0.3932 1.821 0.216 0.829 -3.176 3.962 x46 1.7853 1.822 0.980 0.327 -1.785 5.356 x47 1.1296 1.812 0.623 0.533 -2.422 4.681 x48 -2.9710 1.604 -1.852 0.064 -6.115 0.173 x49 0.8453 1.519 0.557 0.578 -2.132 3.822 x50 -1.2069 2.017 -0.598 0.550 -5.161 2.747 x51 -1.0667 1.038 -1.028 0.304 -3.101 0.968 x52 -0.1402 1.087 -0.129 0.897 -2.271 1.991 ==============================================================================
# Build the week-11 design matrix with the SAME column layout as training,
# then rank QBs by predicted points.
QB_pred_df = QB_data.update(11, pred=True)

# One-hot encode the opponent against the full training team list so the
# prediction matrix has identical columns even if some teams are absent.
opp_series = QB_pred_df.pop('opp')
team_list = np.unique(all_teams[all_teams != 0])
team_mat = pd.DataFrame(np.zeros((QB_pred_df.shape[0], len(team_list))),
                        columns=team_list)
for team in team_list:
    # BUG FIX: np.where returns a tuple, so the original len(match_index) > 0
    # was always true; take element [0] for the positional indices. Also use
    # .iloc instead of the deprecated chained-.ix assignment.
    match_index = np.where(opp_series.values == team)[0]
    if match_index.size > 0:
        team_mat.iloc[match_index, team_mat.columns.get_loc(team)] = 1

home_away = pd.get_dummies(QB_pred_df.pop('home_away'))
home_away.columns = ['bye_inj', 'away', 'home']
QB_pred_df = pd.concat([QB_pred_df, team_mat, home_away], axis=1)
# 1 = played (not bye/injured), matching the training encoding.
QB_pred_df['bye_inj'] = np.double(QB_pred_df['bye_inj'] == 0)

# Week-11 rows: first 3 columns for display, columns 5+ as predictors.
week11 = QB_pred_df[QB_pred_df['week'] == 11]
pred_df = week11.iloc[:, :3].copy()  # .copy() avoids SettingWithCopy on assignment below
data_exo = week11.iloc[:, 5:].values
# Zero out site predictions on bye/injury weeks, exactly as in training.
data_exo[:, :17] = data_exo[:, :17] * np.tile(data_exo[:, -3], (17, 1)).T
data_exo = sm.add_constant(data_exo, prepend=False)

# Vectorized prediction instead of a per-row loop.
pred_df['predicted_points'] = glm_results.predict(data_exo)
pred_df.sort_values('predicted_points', ascending=False)
FirstName | LastName | Team | predicted_points | |
---|---|---|---|---|
0 | Cam | Newton | CAR | 30.834317 |
6 | Ben | Roethlisberger | PIT | 22.895674 |
7 | Kirk | Cousins | WAS | 21.951479 |
4 | Drew | Brees | NO | 21.737187 |
1 | Tom | Brady | NE | 20.867042 |
9 | Blake | Bortles | JAC | 20.211135 |
17 | Colin | Kaepernick | SF | 20.176852 |
2 | Andrew | Luck | IND | 20.030794 |
19 | Jameis | Winston | TB | 18.151417 |
10 | Andy | Dalton | CIN | 18.026586 |
15 | Alex | Smith | KC | 17.283195 |
21 | Cody | Kessler | CLE | 17.229598 |
12 | Russell | Wilson | SEA | 17.084727 |
5 | Marcus | Mariota | TEN | 16.994499 |
24 | Ryan | Tannehill | MIA | 16.751200 |
3 | Aaron | Rodgers | GB | 16.371748 |
16 | Eli | Manning | NYG | 16.006075 |
13 | Dak | Prescott | DAL | 15.300529 |
23 | Carson | Wentz | PHI | 15.070783 |
8 | Matthew | Stafford | DET | 14.440080 |
27 | Jared | Goff | LA | 14.014053 |
20 | Carson | Palmer | ARI | 13.239192 |
14 | Tyrod | Taylor | BUF | 12.869826 |
11 | Derek | Carr | OAK | 12.355795 |
26 | Sam | Bradford | MIN | 10.730014 |
18 | Joe | Flacco | BAL | 10.723287 |
22 | Brock | Osweiler | HOU | 9.014153 |
25 | Jay | Cutler | CHI | 7.253098 |
45 | Matt | McGloin | OAK | 0.136155 |
68 | Joe | Webb | CAR | 0.083788 |
... | ... | ... | ... | ... |
76 | Garrett | Grayson | NO | 0.000000 |
77 | Christian | Ponder | SF | 0.000000 |
78 | Stephen | Morris | IND | 0.000000 |
79 | Jameill | Showers | DAL | 0.000000 |
80 | Mark | Sanchez | DAL | 0.000000 |
81 | Kellen | Moore | DAL | 0.000000 |
62 | Kevin | Hogan | CLE | 0.000000 |
82 | David | Fales | BAL | 0.000000 |
60 | Alex | Tanney | TEN | 0.000000 |
57 | Nate | Sudfeld | WAS | 0.000000 |
56 | Joel | Stave | KC | 0.000000 |
59 | Cardale | Jones | BUF | 0.000000 |
30 | Landry | Jones | PIT | -0.099687 |
51 | Ryan | Mallett | BAL | -0.117142 |
54 | Matt | Cassel | TEN | -0.117142 |
46 | Matt | Barkley | CHI | -0.124125 |
34 | Shaun | Hill | MIN | -0.124125 |
52 | Drew | Stanton | ARI | -0.124125 |
49 | E.J. | Manuel | BUF | -0.159036 |
48 | Derek | Anderson | CAR | -0.206356 |
47 | Jimmy | Garoppolo | NE | -0.255232 |
37 | Scott | Tolzien | IND | -0.255232 |
53 | Trevone | Boykin | SEA | -0.262214 |
36 | Tom | Savage | HOU | -0.269196 |
50 | Mike | Glennon | TB | -0.276179 |
32 | Colt | McCoy | WAS | -0.428233 |
38 | Case | Keenum | LA | -0.573305 |
29 | Blaine | Gabbert | SF | -0.739323 |
42 | Nick | Foles | KC | -0.966626 |
39 | Josh | McCown | CLE | -1.057396 |
83 rows × 4 columns
So I have individual players across all the weeks. One thing I want to do is use past performance to predict future performance, but how to do this is confusing... I also feel like I should have a hierarchy somewhere, but where it belongs is unclear.
Ok. So maybe this is the best way to think about it. Every player has a true mean and sd for scoring and these truths exert influence on the predictions and past performance, and then observed behavior is drawn from this. Then there are things that modulate his true mean like who the opponent is.
import pystan

# Fully pooled linear model: one intercept (beta[1]) and one slope (beta[2])
# shared by all players, regressing actual points on the Yahoo prediction.
pooled_model = """
data {
int<lower=0> N;
vector[N] x;
vector[N] y;
}
parameters {
vector[2] beta;
real<lower=0> sigma;
}
model {
y ~ normal(beta[1] + beta[2] * x, sigma);
}
"""
pooled_data_dict = {'N': len(data_endo),
                    'x': QB_data.yahoo.values,  # NOTE(review): assumes ffball_data exposes a .yahoo series — confirm
                    'y': data_endo.values}
pooled_fit = pystan.stan(model_code=pooled_model, data=pooled_data_dict, iter=1000, chains=2)
pooled_sample = pooled_fit.extract(permuted=True)
# Posterior means: b0 = intercept, m0 = slope.
b0, m0 = pooled_sample['beta'].T.mean(1)

# BUG FIX: the original plotted srrs_mn.floor / srrs_mn.activity — undefined
# names copy-pasted from the radon example. Plot the data the model was
# actually fit on, with the x-range spanning the observed predictions.
x_obs = QB_data.yahoo.values
plt.scatter(x_obs, data_endo.values)
xvals = np.linspace(x_obs.min(), x_obs.max())
plt.plot(xvals, m0 * xvals + b0, 'r--')
import pystan

# Classic "eight schools" hierarchical example — kept here as a template for
# the player-level hierarchy sketched in the notes above. theta[j] is the
# school-level effect, drawn around a shared mean mu with spread tau.
schools_code = """
data {
int<lower=0> J; // number of schools
real y[J]; // estimated treatment effects
real<lower=0> sigma[J]; // s.e. of effect estimates
}
parameters {
real mu;
real<lower=0> tau;
real eta[J];
}
transformed parameters {
real theta[J];
for (j in 1:J)
theta[j] <- mu + tau * eta[j];
}
model {
eta ~ normal(0, 1);
y ~ normal(theta, sigma);
}
"""

# Estimated treatment effects and their standard errors, one per school.
schools_dat = dict(J=8,
                   y=[28, 8, -3, 7, -1, 1, 18, 12],
                   sigma=[15, 10, 16, 11, 9, 11, 10, 18])

fit = pystan.stan(model_code=schools_code, data=schools_dat, iter=1000, chains=4)
# Scrape FFToday's week-2 QB projections (PPR scoring) into a DataFrame.
season = 2016
week = 2
position = 'QB'

# Site-specific numeric IDs for positions and league scoring formats.
pos_dict = dict(QB=10, RB=20, WR=30, TE=40, K=80)
league_dict = dict(fftoday=1, fft_ppr=107644, yahoo=17, FFPC=107437, NFFC=5)

urls = dict()
url_template = "http://www.fftoday.com/rankings/playerwkproj.php?Season=%d&GameWeek=%d&PosID=%d&LeagueID=%d"
urls['fftoday'] = url_template % (season, week, pos_dict[position], league_dict['fft_ppr'])

keys = 'fftoday'
# match='FFPts' picks the projection table; header=16 is where its header row sits.
df = pd.read_html(urls[keys], match='FFPts', header=16)[0]