# Notebook setup: pandas/numpy for data wrangling, matplotlib for plots,
# statsmodels for the GLM fits further below.
import pandas as pd, numpy as np, matplotlib.pyplot as plt, math, statsmodels.api as sm
%matplotlib inline
plt.style.use('ggplot')
# Project-local scraper/loader for fantasy-football data, keyed by position.
from ffball_getData import ffball_data
QB_data = ffball_data('QB')
# One-time scrape of weeks 2-10 followed by a save to disk; kept commented
# out so re-running the notebook does not re-hit the data sources.
#for weeks in range(2,11):
# QB_data.update(weeks)
#QB_data.save_df()
TODO: recode the bye week; one-hot encode home/away; add interaction effects; drop the "no opponent" level; encode teams for prediction by looping over the team names; try a Gamma link; add Yahoo data?
#axs = pd.tools.plotting.scatter_matrix(QB_data.ix[:,-8:], figsize=(6,6), diagonal='kde');
#[ax.set_yticks([]) for ax in axs[:,0]] #turn off the ticks that take up way too much space in such a crammed figure
#[ax.set_xticks([]) for ax in axs[-1,:]];
So I want past performance to influence current predictions and both of them to influence the predicted variable. Is this possible? I am not sure about the time thing. It seems like it should be part of the model, but I am uncertain about how to include it. I should also probably have the matchup in the model. There is nothing about matchup in this model and that would be helpful. How to get that data and how exactly to give it to the model is a mystery...
Ok, then there is the matter of making player-specific predictions, but hopefully every prediction is more or less equally good for each player — so while the predictions are player-specific, the error is shared across the players.
I am having trouble seeing the big picture here, so I will just start with a stupid linear model and go from there.
$$Actual = a_{[i]j} \cdot x + \eta$$

import statsmodels.api as sm
import math

# Load the scraped QB data and one-hot encode the categorical columns.
QB_df = QB_data.load_df()

# Opponent: one-hot encode and drop the 0 level (presumably "no opponent" /
# bye — see the TODO above), which becomes the reference category.
all_teams = QB_df.pop('opp')
team_enc = pd.get_dummies(all_teams)
team_enc.pop(0)

# Home/away indicator expands to three dummies: bye/injured, away, home.
home_away = pd.get_dummies(QB_df.pop('home_away'))
home_away.columns = ['bye_inj', 'away', 'home']
QB_df = pd.concat([QB_df, team_enc, home_away], axis=1)
# Flip bye_inj so 1 means "played this week" — used below to zero out
# site predictions on bye/injury weeks.
QB_df['bye_inj'] = np.double(QB_df['bye_inj'] == 0)
#QB_df = QB_df[QB_df['player_num']<35]

# Response: actual CBS fantasy points. Predictors: every column from the
# 6th onward. .iloc replaces the deprecated (and later removed) .ix indexer.
data_endo = QB_df.pop('cbs_actual2')
data_exo = QB_df.iloc[:, 5:].values
# Interact the first 17 exogenous columns (the per-site predictions/stats —
# TODO confirm the count) with bye_inj, the 3rd column from the end.
data_exo[:, :17] = data_exo[:, :17] * np.tile(data_exo[:, -3], (17, 1)).T
data_exo = sm.add_constant(data_exo, prepend=False)

# Plain Gaussian GLM with identity link for now; a Gamma family is a TODO.
# (Renamed from glm_gamma: the family here is Gaussian, not Gamma.)
glm_model = sm.GLM(data_endo, data_exo, family=sm.families.Gaussian())
glm_results = glm_model.fit()
print(glm_results.summary())
# Compare each prediction site (and our fitted GLM) against the actual
# points via mean squared error, with standard errors of the MSE.
MSE, STE = [], []
predicting_sites = ['FPTS', 'fsharks', 'fftoday', 'nfl', 'espn', 'yahoo', 'cbs_predict']
for predictors in predicting_sites:
    # Vectorized squared errors for this site's predictions.
    SE = (QB_df[predictors].values - data_endo.values) ** 2
    MSE.append(SE.mean())
    STE.append(SE.std() / math.sqrt(len(SE)))

# Our model: predict all rows in one call instead of a per-row loop.
SE = (glm_results.predict(data_exo) - data_endo.values) ** 2
MSE.append(SE.mean())
STE.append(SE.std() / math.sqrt(len(SE)))

# 8 points: the 7 sites plus the fitted model ('model' tick at the end).
plt.errorbar(np.arange(0, 8), MSE, yerr=STE)
plt.xlim(-1, 8)
plt.ylabel('MSE')
plt.xticks(np.arange(-1, 9), [''] + predicting_sites + ['model'] + [''])
Generalized Linear Model Regression Results ============================================================================== Dep. Variable: cbs_actual2 No. Observations: 747 Model: GLM Df Residuals: 699 Model Family: Gaussian Df Model: 47 Link Function: identity Scale: 19.723653825 Method: IRLS Log-Likelihood: -2148.9 Date: Sun, 20 Nov 2016 Deviance: 13787. Time: 16:49:39 Pearson chi2: 1.38e+04 No. Iterations: 4 ============================================================================== coef std err z P>|z| [95.0% Conf. Int.] ------------------------------------------------------------------------------ x1 -0.0586 0.092 -0.640 0.522 -0.238 0.121 x2 0.1899 0.149 1.272 0.203 -0.103 0.483 x3 0.0197 0.024 0.824 0.410 -0.027 0.067 x4 5.0189 3.375 1.487 0.137 -1.596 11.634 x5 -3.0335 1.259 -2.409 0.016 -5.501 -0.566 x6 0.0388 0.021 1.879 0.060 -0.002 0.079 x7 0.4611 0.266 1.735 0.083 -0.060 0.982 x8 0.0613 0.076 0.810 0.418 -0.087 0.210 x9 0.1607 0.150 1.069 0.285 -0.134 0.455 x10 8.9759 3.378 2.657 0.008 2.354 15.597 x11 1.9341 1.303 1.484 0.138 -0.620 4.488 x12 1.9341 1.303 1.484 0.138 -0.620 4.488 x13 0.0508 0.148 0.344 0.731 -0.238 0.340 x14 0.0508 0.148 0.344 0.731 -0.238 0.340 const -2.214e-12 7.83e-13 -2.827 0.005 -3.75e-12 -6.79e-13 x15 -2.2730 1.405 -1.618 0.106 -5.026 0.480 x16 -0.9640 0.550 -1.754 0.079 -2.041 0.113 x17 0.1746 0.095 1.836 0.066 -0.012 0.361 x18 -0.9428 0.664 -1.420 0.156 -2.244 0.359 x19 0.1768 0.041 4.288 0.000 0.096 0.258 x20 -0.1590 0.114 -1.399 0.162 -0.382 0.064 x21 0.2992 0.223 1.339 0.181 -0.139 0.737 x22 1.4151 0.656 2.159 0.031 0.130 2.700 x23 -1.6888 1.844 -0.916 0.360 -5.302 1.925 x24 -6.4333 2.038 -3.156 0.002 -10.428 -2.438 x25 0.7689 1.234 0.623 0.533 -1.650 3.187 x26 1.1333 1.269 0.893 0.372 -1.353 3.620 x27 -3.7517 1.650 -2.273 0.023 -6.986 -0.517 x28 -2.8892 1.857 -1.556 0.120 -6.529 0.751 x29 -0.5590 1.278 -0.437 0.662 -3.063 1.945 x30 5.0072 1.713 2.923 0.003 1.650 8.365 x31 4.0166 1.837 2.187 0.029 0.417 7.616 x32 -0.2773 
1.178 -0.235 0.814 -2.585 2.031 x33 1.9189 1.922 0.999 0.318 -1.847 5.685 x34 -0.1508 1.197 -0.126 0.900 -2.496 2.195 x35 0.0854 1.749 0.049 0.961 -3.342 3.513 x36 4.4033 1.782 2.471 0.013 0.911 7.896 x37 5.5240 1.616 3.419 0.001 2.357 8.691 x38 -0.3842 2.097 -0.183 0.855 -4.494 3.725 x39 -1.9738 1.662 -1.188 0.235 -5.230 1.283 x40 -3.1657 2.247 -1.409 0.159 -7.569 1.237 x41 -3.9229 1.867 -2.101 0.036 -7.583 -0.263 x42 -0.6978 1.817 -0.384 0.701 -4.260 2.864 x43 0.6138 1.894 0.324 0.746 -3.098 4.326 x44 0.0339 1.672 0.020 0.984 -3.243 3.311 x45 0.3932 1.821 0.216 0.829 -3.176 3.962 x46 1.7853 1.822 0.980 0.327 -1.785 5.356 x47 1.1296 1.812 0.623 0.533 -2.422 4.681 x48 -2.9710 1.604 -1.852 0.064 -6.115 0.173 x49 0.8453 1.519 0.557 0.578 -2.132 3.822 x50 -1.2069 2.017 -0.598 0.550 -5.161 2.747 x51 -1.0667 1.038 -1.028 0.304 -3.101 0.968 x52 -0.1402 1.087 -0.129 0.897 -2.271 1.991 ==============================================================================
# Build the week-11 design matrix with the SAME column layout as training,
# then rank QBs by predicted points.
QB_pred_df = QB_data.update(11, pred=True)

# One-hot encode the opponent against the full training team list so the
# prediction matrix has identical columns even if some teams are absent.
opp_series = QB_pred_df.pop('opp')
team_list = np.unique(all_teams[all_teams != 0])
team_mat = pd.DataFrame(np.zeros((QB_pred_df.shape[0], len(team_list))),
                        columns=team_list)
for team in team_list:
    # BUG FIX: np.where returns a tuple, so the original len(match_index) > 0
    # was always true; take element [0] for the positional indices. Also use
    # .iloc instead of the deprecated chained-.ix assignment.
    match_index = np.where(opp_series.values == team)[0]
    if match_index.size > 0:
        team_mat.iloc[match_index, team_mat.columns.get_loc(team)] = 1

home_away = pd.get_dummies(QB_pred_df.pop('home_away'))
home_away.columns = ['bye_inj', 'away', 'home']
QB_pred_df = pd.concat([QB_pred_df, team_mat, home_away], axis=1)
# 1 = played (not bye/injured), matching the training encoding.
QB_pred_df['bye_inj'] = np.double(QB_pred_df['bye_inj'] == 0)

# Week-11 rows: first 3 columns for display, columns 5+ as predictors.
week11 = QB_pred_df[QB_pred_df['week'] == 11]
pred_df = week11.iloc[:, :3].copy()  # .copy() avoids SettingWithCopy on assignment below
data_exo = week11.iloc[:, 5:].values
# Zero out site predictions on bye/injury weeks, exactly as in training.
data_exo[:, :17] = data_exo[:, :17] * np.tile(data_exo[:, -3], (17, 1)).T
data_exo = sm.add_constant(data_exo, prepend=False)

# Vectorized prediction instead of a per-row loop.
pred_df['predicted_points'] = glm_results.predict(data_exo)
pred_df.sort_values('predicted_points', ascending=False)
FirstName | LastName | Team | predicted_points | |
---|---|---|---|---|
0 | Cam | Newton | CAR | 30.834317 |
6 | Ben | Roethlisberger | PIT | 22.895674 |
7 | Kirk | Cousins | WAS | 21.951479 |
4 | Drew | Brees | NO | 21.737187 |
1 | Tom | Brady | NE | 20.867042 |
9 | Blake | Bortles | JAC | 20.211135 |
17 | Colin | Kaepernick | SF | 20.176852 |
2 | Andrew | Luck | IND | 20.030794 |
19 | Jameis | Winston | TB | 18.151417 |
10 | Andy | Dalton | CIN | 18.026586 |
15 | Alex | Smith | KC | 17.283195 |
21 | Cody | Kessler | CLE | 17.229598 |
12 | Russell | Wilson | SEA | 17.084727 |
5 | Marcus | Mariota | TEN | 16.994499 |
24 | Ryan | Tannehill | MIA | 16.751200 |
3 | Aaron | Rodgers | GB | 16.371748 |
16 | Eli | Manning | NYG | 16.006075 |
13 | Dak | Prescott | DAL | 15.300529 |
23 | Carson | Wentz | PHI | 15.070783 |
8 | Matthew | Stafford | DET | 14.440080 |
27 | Jared | Goff | LA | 14.014053 |
20 | Carson | Palmer | ARI | 13.239192 |
14 | Tyrod | Taylor | BUF | 12.869826 |
11 | Derek | Carr | OAK | 12.355795 |
26 | Sam | Bradford | MIN | 10.730014 |
18 | Joe | Flacco | BAL | 10.723287 |
22 | Brock | Osweiler | HOU | 9.014153 |
25 | Jay | Cutler | CHI | 7.253098 |
45 | Matt | McGloin | OAK | 0.136155 |
68 | Joe | Webb | CAR | 0.083788 |
... | ... | ... | ... | ... |
76 | Garrett | Grayson | NO | 0.000000 |
77 | Christian | Ponder | SF | 0.000000 |
78 | Stephen | Morris | IND | 0.000000 |
79 | Jameill | Showers | DAL | 0.000000 |
80 | Mark | Sanchez | DAL | 0.000000 |
81 | Kellen | Moore | DAL | 0.000000 |
62 | Kevin | Hogan | CLE | 0.000000 |
82 | David | Fales | BAL | 0.000000 |
60 | Alex | Tanney | TEN | 0.000000 |
57 | Nate | Sudfeld | WAS | 0.000000 |
56 | Joel | Stave | KC | 0.000000 |
59 | Cardale | Jones | BUF | 0.000000 |
30 | Landry | Jones | PIT | -0.099687 |
51 | Ryan | Mallett | BAL | -0.117142 |
54 | Matt | Cassel | TEN | -0.117142 |
46 | Matt | Barkley | CHI | -0.124125 |
34 | Shaun | Hill | MIN | -0.124125 |
52 | Drew | Stanton | ARI | -0.124125 |
49 | E.J. | Manuel | BUF | -0.159036 |
48 | Derek | Anderson | CAR | -0.206356 |
47 | Jimmy | Garoppolo | NE | -0.255232 |
37 | Scott | Tolzien | IND | -0.255232 |
53 | Trevone | Boykin | SEA | -0.262214 |
36 | Tom | Savage | HOU | -0.269196 |
50 | Mike | Glennon | TB | -0.276179 |
32 | Colt | McCoy | WAS | -0.428233 |
38 | Case | Keenum | LA | -0.573305 |
29 | Blaine | Gabbert | SF | -0.739323 |
42 | Nick | Foles | KC | -0.966626 |
39 | Josh | McCown | CLE | -1.057396 |
83 rows × 4 columns
So I have individual players across all the weeks. One thing I want to do is use past performance to predict future performance, but how to do this is confusing... I also feel like I should have a hierarchy somewhere, but where it belongs is unclear.
Ok. So maybe this is the best way to think about it. Every player has a true mean and sd for scoring and these truths exert influence on the predictions and past performance, and then observed behavior is drawn from this. Then there are things that modulate his true mean like who the opponent is.
import pystan

# Fully pooled linear model: one intercept (beta[1]) and one slope (beta[2])
# shared by all players, regressing actual points on the Yahoo prediction.
pooled_model = """
data {
int<lower=0> N;
vector[N] x;
vector[N] y;
}
parameters {
vector[2] beta;
real<lower=0> sigma;
}
model {
y ~ normal(beta[1] + beta[2] * x, sigma);
}
"""
pooled_data_dict = {'N': len(data_endo),
                    'x': QB_data.yahoo.values,  # NOTE(review): assumes ffball_data exposes a .yahoo series — confirm
                    'y': data_endo.values}
pooled_fit = pystan.stan(model_code=pooled_model, data=pooled_data_dict, iter=1000, chains=2)
pooled_sample = pooled_fit.extract(permuted=True)
# Posterior means: b0 = intercept, m0 = slope.
b0, m0 = pooled_sample['beta'].T.mean(1)

# BUG FIX: the original plotted srrs_mn.floor / srrs_mn.activity — undefined
# names copy-pasted from the radon example. Plot the data the model was
# actually fit on, with the x-range spanning the observed predictions.
x_obs = QB_data.yahoo.values
plt.scatter(x_obs, data_endo.values)
xvals = np.linspace(x_obs.min(), x_obs.max())
plt.plot(xvals, m0 * xvals + b0, 'r--')
import pystan

# Classic "eight schools" hierarchical example — kept here as a template for
# the player-level hierarchy sketched in the notes above. theta[j] is the
# school-level effect, drawn around a shared mean mu with spread tau.
schools_code = """
data {
int<lower=0> J; // number of schools
real y[J]; // estimated treatment effects
real<lower=0> sigma[J]; // s.e. of effect estimates
}
parameters {
real mu;
real<lower=0> tau;
real eta[J];
}
transformed parameters {
real theta[J];
for (j in 1:J)
theta[j] <- mu + tau * eta[j];
}
model {
eta ~ normal(0, 1);
y ~ normal(theta, sigma);
}
"""

# Estimated treatment effects and their standard errors, one per school.
schools_dat = dict(J=8,
                   y=[28, 8, -3, 7, -1, 1, 18, 12],
                   sigma=[15, 10, 16, 11, 9, 11, 10, 18])

fit = pystan.stan(model_code=schools_code, data=schools_dat, iter=1000, chains=4)
# Scrape FFToday's week-2 QB projections (PPR scoring) into a DataFrame.
season = 2016
week = 2
position = 'QB'

# Site-specific numeric IDs for positions and league scoring formats.
pos_dict = dict(QB=10, RB=20, WR=30, TE=40, K=80)
league_dict = dict(fftoday=1, fft_ppr=107644, yahoo=17, FFPC=107437, NFFC=5)

urls = dict()
url_template = "http://www.fftoday.com/rankings/playerwkproj.php?Season=%d&GameWeek=%d&PosID=%d&LeagueID=%d"
urls['fftoday'] = url_template % (season, week, pos_dict[position], league_dict['fft_ppr'])

keys = 'fftoday'
# match='FFPts' picks the projection table; header=16 is where its header row sits.
df = pd.read_html(urls[keys], match='FFPts', header=16)[0]