import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble
from sklearn import metrics
from sklearn.model_selection import train_test_split
# Remove aggregate rows, replace NaN with 0
puf = pd.read_csv('puf2011.csv')
puf = puf[~puf['RECID'].isin([999996, 999997, 999998, 999999])]
puf = puf.fillna(0)
# Constant column
puf['constant'] = 1
# MARS to K - 1 dummies
puf[['MARS2', 'MARS3', 'MARS4']] = pd.get_dummies(puf['MARS'], drop_first=True)
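As a quick illustration of the K - 1 encoding (toy MARS values, not PUF data): drop_first=True omits the first category, so MARS == 1 becomes the all-zeros baseline.
pd.get_dummies(pd.Series([1, 2, 3, 4], name='MARS'), drop_first=True)
#    2  3  4
# 0  0  0  0
# 1  1  0  0
# 2  0  1  0
# 3  0  0  1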
# E19800 and E20100 combined in CPS
puf['E19800_E20100'] = puf['E19800'] + puf['E20100']
# Categorical dependent variable for 1st stage
puf['sign'] = np.where(puf['P22250'] == 0, 'zer', np.where(puf['P22250'] > 0, 'pos', 'neg'))
# All variables shared between puf and cps, except for E00650 (colinear w/E00600)
predictors = [
'DSI', 'EIC', 'MARS2', 'MARS3', 'MARS4', 'XTOT',
'E00200', 'E00300', 'E00400', 'E00600', 'E00800', 'E00900',
'E01400', 'E01500', 'E01700', 'E02100', 'E02300', 'E02400',
'E03150', 'E03210', 'E03240', 'E03270', 'E03300', 'E17500',
'E18400', 'E18500', 'E19200', 'E19800_E20100','E20400',
'E32800', 'F2441', 'N24', 'E01100'
]
discretes = ['DSI', 'EIC', 'MARS2', 'MARS3', 'MARS4', 'XTOT', 'F2441', 'N24']
# Create log columns of continuous predictors/response
logs = []
for i in predictors + ['P22250']:
    if i not in discretes:
        if puf[i].min() >= 1:
            puf['log_' + i] = np.log(puf[i])
        else:
            # Shift so the column's minimum maps to 1 before taking logs.
            puf['log_' + i] = np.log(puf[i] - puf[i].min() + 1)
        logs.append('log_' + i)
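As a worked example of the shifted branch (hypothetical values, not PUF data), a column whose minimum is -5000 gets shifted so that its smallest value maps to log(1) = 0:
x = pd.Series([-5000, 0, 10000])
np.log(x - x.min() + 1)   # [0.0, log(5001) ~ 8.52, log(15001) ~ 9.62]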
keep = ['RECID', 'AGIR1', 'sign', 'P22250', 'constant'] + predictors + logs
puf = puf[keep]
np.random.seed(100)
train, test = train_test_split(puf.copy(), test_size=0.2)
# Drop two negative outliers
train = train.drop(labels=[46928, 143309])
# Sub-DataFrames where P22250 > 0 or < 0, used for the 2nd-stage imputation
pos_train = train[train['P22250'] > 0].copy()
neg_train = train[train['P22250'] < 0].copy()
pos_test = test[test['P22250'] > 0].copy()
neg_test = test[test['P22250'] < 0].copy()
Using rf.predict
# 100 estimators
N_ESTIMATORS = 100
rf = ensemble.RandomForestRegressor(n_estimators=N_ESTIMATORS,
                                    min_samples_leaf=1, random_state=3,
                                    verbose=True,
                                    n_jobs=-1)  # Use all available cores.
rf.fit(train[predictors], train['P22250'])
rf_predictions = rf.predict(test[predictors])
Extracting per-estimator predictions into rf_preds
rf_preds = []
for estimator in rf.estimators_:
rf_preds.append(estimator.predict(test[predictors]))
rf_preds = np.array(rf_preds).transpose() # One row per record.
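A quick shape check (an added sanity check, not part of the original run): after the transpose, rows index test records and columns index estimators.
assert rf_preds.shape == (len(test), N_ESTIMATORS)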
We can calculate the RF model's predicted probability for each sign (and thus its log-loss) using the fraction of estimators predicting that sign for each observation. Note: metrics.log_loss() assumes that the columns of predicted probabilities correspond to the categories in alphabetical order. Our categories 'neg', 'zer', and 'pos' sort alphabetically as 'neg', 'pos', 'zer', so they appear in that order in rf_pred_proba as [preds_neg, preds_pos, preds_zer].
preds_neg = np.sum(rf_preds < 0, axis=1) / N_ESTIMATORS
preds_zer = np.sum(rf_preds == 0, axis=1) / N_ESTIMATORS
preds_pos = np.sum(rf_preds > 0, axis=1) / N_ESTIMATORS
rf_pred_proba = np.column_stack([preds_neg, preds_pos, preds_zer])  # alphabetical order
metrics.log_loss(test['sign'], rf_pred_proba)
0.6897766709507555
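If you'd rather not rely on the alphabetical-order convention, sklearn's log_loss also accepts an explicit labels argument; this sketch should reproduce the value above:
metrics.log_loss(test['sign'], rf_pred_proba, labels=['neg', 'pos', 'zer'])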
# RMSE of the RF point predictions
(test['P22250'] - rf_predictions).pow(2).mean() ** 0.5
411117.31955911306
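The same computation as a small reusable helper (rmse is a name introduced here for convenience, not part of the original analysis):
def rmse(actual, predicted):
    # Root-mean-squared error of a prediction vector.
    return ((actual - predicted) ** 2).mean() ** 0.5

rmse(test['P22250'], rf_predictions)   # matches the value above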
# Formula: sign ~ DSI + EIC + ... + F2441 + N24
# It turns out that only 1.3% of E01100 is non-zero, which could be why
# including it as a predictor breaks mnlogit; E01100 is thus excluded.
predictors_mnl = predictors[:-1]
formula = 'sign ~ ' + ' + '.join(predictors_mnl)
mnl_fit = smf.mnlogit(formula=formula, data=train).fit()
summary = mnl_fit.summary()
summary
Optimization terminated successfully. Current function value: 0.577166 Iterations 9
Dep. Variable: | y | No. Observations: | 131026 |
---|---|---|---|
Model: | MNLogit | Df Residuals: | 130960 |
Method: | MLE | Df Model: | 64 |
Date: | Fri, 10 Aug 2018 | Pseudo R-squ.: | 0.2373 |
Time: | 16:06:26 | Log-Likelihood: | -75624. |
converged: | True | LL-Null: | -99159. |
| | LLR p-value: | 0.000 |
y=sign[pos] | coef | std err | z | P>|z| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
Intercept | -0.1281 | 0.034 | -3.727 | 0.000 | -0.195 | -0.061 |
DSI | 0.1601 | 0.147 | 1.092 | 0.275 | -0.127 | 0.447 |
EIC | -0.2173 | 0.118 | -1.848 | 0.065 | -0.448 | 0.013 |
MARS2 | -0.0308 | 0.041 | -0.752 | 0.452 | -0.111 | 0.050 |
MARS3 | -0.0040 | 0.080 | -0.051 | 0.959 | -0.160 | 0.152 |
MARS4 | 0.0132 | 0.079 | 0.167 | 0.868 | -0.142 | 0.169 |
XTOT | -0.0009 | 0.019 | -0.047 | 0.963 | -0.037 | 0.035 |
E00200 | -1.787e-08 | 7.39e-09 | -2.419 | 0.016 | -3.23e-08 | -3.39e-09 |
E00300 | 9.886e-08 | 3.2e-08 | 3.086 | 0.002 | 3.61e-08 | 1.62e-07 |
E00400 | -1.294e-07 | 5.07e-08 | -2.550 | 0.011 | -2.29e-07 | -3e-08 |
E00600 | -1.73e-08 | 1.76e-08 | -0.980 | 0.327 | -5.19e-08 | 1.73e-08 |
E00800 | -1.384e-05 | 1.03e-05 | -1.350 | 0.177 | -3.39e-05 | 6.26e-06 |
E00900 | -3.552e-08 | 3.51e-08 | -1.011 | 0.312 | -1.04e-07 | 3.34e-08 |
E01400 | -5.094e-09 | 6.17e-08 | -0.083 | 0.934 | -1.26e-07 | 1.16e-07 |
E01500 | -5.403e-09 | 2.02e-08 | -0.267 | 0.789 | -4.5e-08 | 3.42e-08 |
E01700 | -7.231e-08 | 1.64e-07 | -0.441 | 0.659 | -3.93e-07 | 2.49e-07 |
E02100 | 2.644e-08 | 1.16e-07 | 0.228 | 0.819 | -2.01e-07 | 2.53e-07 |
E02300 | -1.062e-05 | 6.17e-06 | -1.721 | 0.085 | -2.27e-05 | 1.48e-06 |
E02400 | -3.856e-06 | 9.18e-07 | -4.200 | 0.000 | -5.66e-06 | -2.06e-06 |
E03150 | 5.022e-07 | 6.84e-06 | 0.073 | 0.941 | -1.29e-05 | 1.39e-05 |
E03210 | -6.289e-05 | 0.000 | -0.623 | 0.533 | -0.000 | 0.000 |
E03240 | -3.404e-07 | 1.68e-07 | -2.029 | 0.042 | -6.69e-07 | -1.16e-08 |
E03270 | 5.258e-06 | 1.77e-06 | 2.963 | 0.003 | 1.78e-06 | 8.74e-06 |
E03300 | -3.957e-07 | 4.66e-07 | -0.849 | 0.396 | -1.31e-06 | 5.17e-07 |
E17500 | -8.072e-07 | 7.62e-07 | -1.060 | 0.289 | -2.3e-06 | 6.86e-07 |
E18400 | 3.184e-09 | 3.42e-08 | 0.093 | 0.926 | -6.39e-08 | 7.02e-08 |
E18500 | -3.136e-07 | 4.36e-07 | -0.720 | 0.472 | -1.17e-06 | 5.4e-07 |
E19200 | 3.845e-07 | 1.36e-07 | 2.824 | 0.005 | 1.18e-07 | 6.51e-07 |
E19800_E20100 | -2.573e-09 | 2.3e-08 | -0.112 | 0.911 | -4.77e-08 | 4.26e-08 |
E20400 | -3.691e-08 | 1.02e-07 | -0.361 | 0.718 | -2.37e-07 | 1.64e-07 |
E32800 | -6.243e-06 | 2.82e-05 | -0.222 | 0.825 | -6.15e-05 | 4.9e-05 |
F2441 | 0.0010 | 0.058 | 0.018 | 0.986 | -0.113 | 0.115 |
N24 | 0.0457 | 0.020 | 2.242 | 0.025 | 0.006 | 0.086 |
y=sign[zer] | coef | std err | z | P>|z| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
Intercept | 3.0321 | 0.028 | 107.842 | 0.000 | 2.977 | 3.087 |
DSI | 0.9075 | 0.107 | 8.481 | 0.000 | 0.698 | 1.117 |
EIC | 1.3177 | 0.074 | 17.715 | 0.000 | 1.172 | 1.463 |
MARS2 | -0.6360 | 0.034 | -18.458 | 0.000 | -0.704 | -0.568 |
MARS3 | -0.3229 | 0.069 | -4.680 | 0.000 | -0.458 | -0.188 |
MARS4 | 0.4112 | 0.060 | 6.797 | 0.000 | 0.293 | 0.530 |
XTOT | -0.1057 | 0.017 | -6.218 | 0.000 | -0.139 | -0.072 |
E00200 | -8.375e-07 | 2.25e-08 | -37.160 | 0.000 | -8.82e-07 | -7.93e-07 |
E00300 | -2.924e-06 | 1.81e-07 | -16.191 | 0.000 | -3.28e-06 | -2.57e-06 |
E00400 | -3.313e-06 | 2.22e-07 | -14.956 | 0.000 | -3.75e-06 | -2.88e-06 |
E00600 | -1.28e-06 | 1e-07 | -12.795 | 0.000 | -1.48e-06 | -1.08e-06 |
E00800 | -1.901e-05 | 5.84e-06 | -3.253 | 0.001 | -3.05e-05 | -7.55e-06 |
E00900 | 7.825e-09 | 4.14e-08 | 0.189 | 0.850 | -7.32e-08 | 8.89e-08 |
E01400 | -1.379e-06 | 1.38e-07 | -9.966 | 0.000 | -1.65e-06 | -1.11e-06 |
E01500 | -3.46e-07 | 2.92e-08 | -11.847 | 0.000 | -4.03e-07 | -2.89e-07 |
E01700 | -3.175e-07 | 2.12e-07 | -1.501 | 0.133 | -7.32e-07 | 9.71e-08 |
E02100 | 2.102e-07 | 1.3e-07 | 1.617 | 0.106 | -4.45e-08 | 4.65e-07 |
E02300 | 2.953e-05 | 4.44e-06 | 6.657 | 0.000 | 2.08e-05 | 3.82e-05 |
E02400 | -1.557e-05 | 8.66e-07 | -17.982 | 0.000 | -1.73e-05 | -1.39e-05 |
E03150 | -4.886e-05 | 6.17e-06 | -7.917 | 0.000 | -6.1e-05 | -3.68e-05 |
E03210 | 0.0008 | 6.9e-05 | 10.871 | 0.000 | 0.001 | 0.001 |
E03240 | 7e-08 | 2.93e-07 | 0.239 | 0.811 | -5.05e-07 | 6.45e-07 |
E03270 | -2.533e-05 | 1.92e-06 | -13.166 | 0.000 | -2.91e-05 | -2.16e-05 |
E03300 | -1.135e-05 | 6.77e-07 | -16.772 | 0.000 | -1.27e-05 | -1e-05 |
E17500 | -5.357e-06 | 1.06e-06 | -5.076 | 0.000 | -7.42e-06 | -3.29e-06 |
E18400 | -7.802e-07 | 1.28e-07 | -6.077 | 0.000 | -1.03e-06 | -5.29e-07 |
E18500 | -3.401e-05 | 1e-06 | -33.976 | 0.000 | -3.6e-05 | -3.2e-05 |
E19200 | -5.089e-07 | 3.67e-07 | -1.388 | 0.165 | -1.23e-06 | 2.1e-07 |
E19800_E20100 | -1.675e-08 | 8.39e-08 | -0.200 | 0.842 | -1.81e-07 | 1.48e-07 |
E20400 | -4.657e-05 | 1.15e-06 | -40.479 | 0.000 | -4.88e-05 | -4.43e-05 |
E32800 | 0.0001 | 2.36e-05 | 5.563 | 0.000 | 8.49e-05 | 0.000 |
F2441 | -0.1777 | 0.049 | -3.591 | 0.000 | -0.275 | -0.081 |
N24 | 0.1079 | 0.019 | 5.747 | 0.000 | 0.071 | 0.145 |
Extracting probabilities
# A bug in MNLogit's .predict() causes the returned columns to lose their labels. However,
# according to the documentation, column 0 is the omitted base category ('neg', the first
# alphabetically). Column 2 holds the highest probabilities, consistent with 'zer' being
# by far the most common sign, so we can infer column 1 = 'pos'.
mnl_pred_proba = mnl_fit.predict(test[predictors_mnl])
mnl_pred_proba.rename(columns={0: 'neg', 1: 'pos', 2: 'zero'}, inplace=True)
# Note that columns are alphabetically ordered, so this will work for metrics.log_loss()
mnl_pred_proba.head()
neg | pos | zero | |
---|---|---|---|
43632 | 0.009440 | 0.007075 | 0.983485 |
55268 | 0.150469 | 0.135993 | 0.713538 |
155007 | 0.403581 | 0.321199 | 0.275220 |
46437 | 0.061226 | 0.050625 | 0.888149 |
30152 | 0.143453 | 0.121501 | 0.735047 |
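A cheap sanity check on that column assignment (an added check, not in the original run): the average predicted probability in each column should roughly match the empirical sign shares in the training data.
print(mnl_pred_proba.mean())                        # mean P(neg), P(pos), P(zero)
print(train['sign'].value_counts(normalize=True))   # empirical sign shares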
Stochastic sign imputation function
# Arguments: 'runif', a random uniform draw in [0, 1]; 'probs', predicted probabilities
# in the order probs[0] = P(neg), probs[1] = P(pos), probs[2] = P(zer).
# Output: the first sign whose cumulative probability exceeds runif.
def stoch_imp(runif, probs):
    if runif < probs[0]:
        return 'neg'
    elif runif < probs[0] + probs[1]:
        return 'pos'
    else:
        return 'zer'
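A worked call (hypothetical probabilities): with P(neg) = 0.2 and P(pos) = 0.3, a draw of 0.45 falls in the 'pos' band [0.2, 0.5).
stoch_imp(0.45, [0.2, 0.3, 0.5])   # -> 'pos'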
# Create a vector of uniform draws, one per test record, to be used in the imputation.
np.random.seed(1995)
runifs = np.random.uniform(size=len(test))
mnl_pred_sign = []
for count, u in enumerate(runifs):
    probs = mnl_pred_proba.iloc[count]
    mnl_pred_sign.append(stoch_imp(u, probs))
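An equivalent vectorized draw, as a sketch (vec_signs is a name introduced here), in case the Python loop ever becomes a bottleneck:
cum = np.cumsum(mnl_pred_proba.values, axis=1)          # cumulative P per row
idx = (runifs[:, None] >= cum).sum(axis=1).clip(max=2)  # 0 = neg, 1 = pos, 2 = zer
vec_signs = np.array(['neg', 'pos', 'zer'])[idx]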
Sign imputation
test['sign_p'] = mnl_pred_sign
Regressing on log(P22250) using (now) correctly log-transformed continuous predictors
# Predictors included are those with p-values <= 0.1
ols_pos_predictors = [
'constant', 'MARS2', 'F2441', 'N24', 'log_E00200',
'log_E00300', 'log_E00400', 'log_E00600', 'E00900',
'log_E01500', 'E02400', 'log_E03240', 'E03270',
'log_E03300', 'log_E17500', 'log_E18400', 'log_E18500',
'log_E19200', 'log_E19800_E20100', 'log_E20400'
]
ols_pos_fit = sm.OLS(pos_train['log_P22250'], pos_train[ols_pos_predictors]).fit()
summary = ols_pos_fit.summary()
summary
Dep. Variable: | log_P22250 | R-squared: | 0.041 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.040 |
Method: | Least Squares | F-statistic: | 35.48 |
Date: | Fri, 10 Aug 2018 | Prob (F-statistic): | 8.55e-128 |
Time: | 16:06:30 | Log-Likelihood: | 61122. |
No. Observations: | 15800 | AIC: | -1.222e+05 |
Df Residuals: | 15780 | BIC: | -1.221e+05 |
Df Model: | 19 | | |
Covariance Type: | nonrobust | | |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
constant | 18.6419 | 0.000 | 1.16e+05 | 0.000 | 18.642 | 18.642 |
MARS2 | -0.0005 | 0.000 | -4.188 | 0.000 | -0.001 | -0.000 |
F2441 | -0.0002 | 0.000 | -1.768 | 0.077 | -0.000 | 2.5e-05 |
N24 | 0.0003 | 4.79e-05 | 5.454 | 0.000 | 0.000 | 0.000 |
log_E00200 | -3.606e-05 | 8.61e-06 | -4.187 | 0.000 | -5.29e-05 | -1.92e-05 |
log_E00300 | 0.0002 | 1.5e-05 | 12.893 | 0.000 | 0.000 | 0.000 |
log_E00400 | -3.266e-05 | 1.04e-05 | -3.148 | 0.002 | -5.3e-05 | -1.23e-05 |
log_E00600 | 9.789e-05 | 1.58e-05 | 6.210 | 0.000 | 6.7e-05 | 0.000 |
E00900 | -3.534e-10 | 1.37e-10 | -2.575 | 0.010 | -6.22e-10 | -8.43e-11 |
log_E01500 | -1.876e-05 | 8.44e-06 | -2.224 | 0.026 | -3.53e-05 | -2.23e-06 |
E02400 | -6.391e-09 | 3.58e-09 | -1.785 | 0.074 | -1.34e-08 | 6.26e-10 |
log_E03240 | -5.125e-05 | 1.52e-05 | -3.365 | 0.001 | -8.11e-05 | -2.14e-05 |
E03270 | 2.709e-08 | 6.82e-09 | 3.975 | 0.000 | 1.37e-08 | 4.04e-08 |
log_E03300 | -3.966e-05 | 1.37e-05 | -2.904 | 0.004 | -6.64e-05 | -1.29e-05 |
log_E17500 | -3.875e-05 | 1.87e-05 | -2.077 | 0.038 | -7.53e-05 | -2.18e-06 |
log_E18400 | 2.361e-05 | 1.02e-05 | 2.307 | 0.021 | 3.55e-06 | 4.37e-05 |
log_E18500 | -8.303e-05 | 1.69e-05 | -4.920 | 0.000 | -0.000 | -4.99e-05 |
log_E19200 | 6.695e-05 | 1.12e-05 | 5.960 | 0.000 | 4.49e-05 | 8.9e-05 |
log_E19800_E20100 | 3.368e-05 | 1.62e-05 | 2.082 | 0.037 | 1.97e-06 | 6.54e-05 |
log_E20400 | 4.084e-05 | 1.19e-05 | 3.418 | 0.001 | 1.74e-05 | 6.43e-05 |
Omnibus: | 38405.648 | Durbin-Watson: | 1.998 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 661684037.771 |
Skew: | 25.403 | Prob(JB): | 0.00 |
Kurtosis: | 1004.253 | Cond. No. | 1.24e+06 |
ols_neg_predictors = [
'constant', 'MARS3', 'MARS4', 'log_E00200', 'log_E00300',
'log_E00400', 'log_E00600', 'log_E00900', 'log_E03210',
'log_E03270', 'log_E17500', 'log_E18400', 'log_E19200',
'log_E19800_E20100'
]
ols_neg_fit = sm.OLS(neg_train['log_P22250'], neg_train[ols_neg_predictors]).fit()
summary = ols_neg_fit.summary()
summary
Dep. Variable: | log_P22250 | R-squared: | 0.013 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.012 |
Method: | Least Squares | F-statistic: | 18.64 |
Date: | Fri, 10 Aug 2018 | Prob (F-statistic): | 5.25e-44 |
Time: | 16:06:30 | Log-Likelihood: | 71202. |
No. Observations: | 18598 | AIC: | -1.424e+05 |
Df Residuals: | 18584 | BIC: | -1.423e+05 |
Df Model: | 13 | | |
Covariance Type: | nonrobust | | |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
constant | 18.4684 | 0.067 | 274.766 | 0.000 | 18.337 | 18.600 |
MARS3 | -0.0011 | 0.000 | -4.056 | 0.000 | -0.002 | -0.001 |
MARS4 | -0.0006 | 0.000 | -2.159 | 0.031 | -0.001 | -5.24e-05 |
log_E00200 | 3.726e-05 | 7.59e-06 | 4.908 | 0.000 | 2.24e-05 | 5.21e-05 |
log_E00300 | -9.291e-05 | 1.47e-05 | -6.320 | 0.000 | -0.000 | -6.41e-05 |
log_E00400 | 3.854e-05 | 9.9e-06 | 3.892 | 0.000 | 1.91e-05 | 5.8e-05 |
log_E00600 | -0.0001 | 1.55e-05 | -6.960 | 0.000 | -0.000 | -7.76e-05 |
log_E00900 | 0.0101 | 0.004 | 2.600 | 0.009 | 0.002 | 0.018 |
log_E03210 | -0.0001 | 6.4e-05 | -1.741 | 0.082 | -0.000 | 1.4e-05 |
log_E03270 | 3.825e-05 | 1.07e-05 | 3.584 | 0.000 | 1.73e-05 | 5.92e-05 |
log_E17500 | 4.68e-05 | 1.74e-05 | 2.692 | 0.007 | 1.27e-05 | 8.09e-05 |
log_E18400 | 2.08e-05 | 9.56e-06 | 2.176 | 0.030 | 2.06e-06 | 3.95e-05 |
log_E19200 | -4.709e-05 | 9.63e-06 | -4.892 | 0.000 | -6.6e-05 | -2.82e-05 |
log_E19800_E20100 | 3.901e-05 | 1.34e-05 | 2.909 | 0.004 | 1.27e-05 | 6.53e-05 |
Omnibus: | 47463.812 | Durbin-Watson: | 2.004 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1187646059.051 |
Skew: | -28.687 | Prob(JB): | 0.00 |
Kurtosis: | 1239.657 | Cond. No. | 4.73e+04 |
Continuous value imputation
# neg_p_test / pos_p_test: test observations whose sign mnlogit imputed as neg / pos
neg_p_test = test[test['sign_p'] == 'neg'].copy()
pos_p_test = test[test['sign_p'] == 'pos'].copy()
log_neg_predictions = ols_neg_fit.predict(neg_p_test[ols_neg_predictors])
log_pos_predictions = ols_pos_fit.predict(pos_p_test[ols_pos_predictors])
# Transform via e^(Y-hat) + min(P22250) - 1
neg_predictions = np.exp(log_neg_predictions) + puf['P22250'].min() - 1
pos_predictions = np.exp(log_pos_predictions) + puf['P22250'].min() - 1
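Since the forward transform was log(x - min + 1), the inverse is exp(y-hat) + min - 1. A quick round-trip check on the training values (an added check, not in the original run):
shift = puf['P22250'].min() - 1
assert np.allclose(np.exp(np.log(train['P22250'] - shift)) + shift, train['P22250'])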
ols_predictions = pd.concat([neg_predictions, pos_predictions], axis = 0).to_frame()
# NaN rows were in neither neg_p_test nor pos_p_test, meaning mnlogit imputed their sign as zero
test['mnlOLS_p'] = ols_predictions
test['mnlOLS_p'] = test['mnlOLS_p'].fillna(0)
metrics.log_loss(test['sign'], mnl_pred_proba)
0.5836617256539257
# RMSE of the two-stage (mnlogit + OLS) predictions
(test['P22250'] - test['mnlOLS_p']).pow(2).mean() ** 0.5
389051.8348869253
# First create a small dataset with the columns of interest
test['RF_p'] = rf_predictions
data = test[['AGIR1', 'P22250', 'RF_p', 'mnlOLS_p']].copy()
What if we just predicted zero?
(data['P22250']).pow(2).mean()**0.5
385182.5436336457
What if we just predicted the mean? (-3291)
(data['P22250'] - data['P22250'].mean()).pow(2).mean()**0.5
385168.48227679287
Both naive baselines achieve a slightly lower RMSE than either model, so RMSE alone tells us little; below I compare the predicted distribution of P22250 over AGI between RF, test (actual), and mnlogit + OLS.
First, limiting the y-axis to +/- 10,000,000. This cuts out only a few predictions/actual observations but makes the bulk of the distribution much easier to see.
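The same three-panel figure is drawn at four zoom levels below. A helper like this captures the shared structure (a sketch; compare_panels is a name introduced here, and the original cells inline this code instead):
def compare_panels(data, ylim, title):
    # Three stacked scatter panels: RF predictions, actual test values, mnlogit + OLS.
    plt.figure(figsize=(15, 20))
    panels = [('RF_p', 'Random Forests'), ('P22250', 'PUF Test'), ('mnlOLS_p', 'MNLogit + OLS')]
    for i, (col, name) in enumerate(panels):
        plt.subplot(3, 1, i + 1)
        plt.title(name, fontsize=17)
        plt.ylim(-ylim, ylim)   # symmetric limits; the third zoom below is asymmetric
        sns.regplot(data['AGIR1'], data[col], fit_reg=False)
    plt.suptitle(title, fontsize=25)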
sns.set(color_codes=True)
plt.figure(figsize=(15,20))
plt.subplot(311)
plt.title('Random Forests', fontsize = 17)
plt.ylim(-10000000, 10000000)
sns.regplot(data['AGIR1'], data['RF_p'], fit_reg=False)
plt.subplot(312)
plt.title('PUF Test', fontsize = 17)
plt.ylim(-10000000, 10000000)
sns.regplot(data['AGIR1'], data['P22250'], fit_reg=False)
plt.subplot(313)
plt.title('MNLogit + OLS', fontsize = 17)
plt.ylim(-10000000, 10000000)
sns.regplot(data['AGIR1'], data['mnlOLS_p'], fit_reg=False)
plt.plot([0, 35], [750000, 750000], color='green', linestyle='dashed', linewidth=1)
plt.plot([0, 35], [-750000, -750000], color='green', linestyle='dashed', linewidth=1)
plt.suptitle('In +/- 10,000,000', fontsize = 25)
+/- 750,000
The green dashed lines in the MNLogit panel above mark the y-axis limits of the next figure.
sns.set(color_codes=True)
plt.figure(figsize=(15,20))
plt.subplot(311)
plt.title('Random Forests', fontsize = 17)
plt.ylim(-750000, 750000)
sns.regplot(data['AGIR1'], data['RF_p'], fit_reg=False)
plt.subplot(312)
plt.title('PUF Test', fontsize = 17)
plt.ylim(-750000, 750000)
sns.regplot(data['AGIR1'], data['P22250'], fit_reg=False)
plt.subplot(313)
plt.title('MNLogit + OLS', fontsize = 17)
plt.ylim(-750000, 750000)
sns.regplot(data['AGIR1'], data['mnlOLS_p'], fit_reg=False)
plt.plot([0, 35], [150000, 150000], color='green', linestyle='dashed', linewidth=1)
plt.plot([0, 35], [-200000, -200000], color='green', linestyle='dashed', linewidth=1)
plt.suptitle('In +/- 750,000', fontsize = 25)
[-200000, 150000]
sns.set(color_codes=True)
plt.figure(figsize=(15,20))
plt.subplot(311)
plt.title('Random Forests', fontsize = 17)
plt.ylim(-200000, 150000)
sns.regplot(data['AGIR1'], data['RF_p'], fit_reg=False)
plt.subplot(312)
plt.title('PUF Test', fontsize = 17)
plt.ylim(-200000, 150000)
sns.regplot(data['AGIR1'], data['P22250'], fit_reg=False)
plt.subplot(313)
plt.title('MNLogit + OLS', fontsize = 17)
plt.ylim(-200000, 150000)
sns.regplot(data['AGIR1'], data['mnlOLS_p'], fit_reg=False)
plt.plot([0, 35], [10000, 10000], color='green', linestyle='dashed', linewidth=1)
plt.plot([0, 35], [-10000, -10000], color='green', linestyle='dashed', linewidth=1)
plt.suptitle('In [-200000, 150000]', fontsize = 25)
+/- 10,000
sns.set(color_codes=True)
plt.figure(figsize=(15,20))
plt.subplot(311)
plt.title('Random Forests', fontsize = 17)
plt.ylim(-10000, 10000)
sns.regplot(data['AGIR1'], data['RF_p'], fit_reg=False)
plt.subplot(312)
plt.title('PUF Test', fontsize = 17)
plt.ylim(-10000, 10000)
sns.regplot(data['AGIR1'], data['P22250'], fit_reg=False)
plt.subplot(313)
plt.title('MNLogit + OLS', fontsize = 17)
plt.ylim(-10000, 10000)
sns.regplot(data['AGIR1'], data['mnlOLS_p'], fit_reg=False)
plt.suptitle('In +/- 10,000', fontsize = 25)