%matplotlib inline
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import random
import csv
import scipy as sp
from statsmodels.graphics import gofplots
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn import preprocessing, compose
from sklearn import model_selection
from sklearn import metrics
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from hyperopt import STATUS_OK, hp, tpe, fmin, Trials
from hyperopt.pyll.stochastic import sample
from hyperopt.pyll.base import scope
from tqdm import tqdm_notebook
sns.set_context("notebook")
The Food Environment Atlas (FEA) is a publicly available dataset compiled by the US Department of Agriculture. According to the website,
"Food environment factors—such as store/restaurant proximity, food prices, food and nutrition assistance programs, and community characteristics—interact to influence food choices and diet quality. These interactions are complex and more research is needed to identify causal relationships and effective policy interventions.
The objectives of the Atlas are:
to assemble statistics on food environment indicators to stimulate research on the determinants of food choices and diet quality, and
to provide a spatial overview of a community's ability to access healthy food and its success in doing so."
This is a compilation of over 275 food environment indicators for every U.S. county. The data were collected from a variety of sources between the years 2007 and 2016.
The objective is to predict the adult obesity rate for 2013 (see the documentation for more details) based on the available data.
def adjusted_r2(y_true, y_pred, n, k):
"""
Computes the adjusted R^2 score for regression.
Arguments
---------
y_true: target variable value
y_pred: predicted target variable value from model
n: number of observations (rows)
k: number of features (columns)
Keywords
--------
None
Returns
-------
r2_adj: adjusted R^2 score
"""
from sklearn import metrics
r2_adj = 1.0 - (1.0 - metrics.r2_score(y_true, y_pred))*(n-1)/(n-k-1)
return r2_adj
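A quick sanity check of this helper on toy values (illustrative only; the arrays below are made up, with k=1 pretend feature):
# With n=5 observations and k=1 feature, the adjusted score sits slightly below the plain R^2.
y_demo_true = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y_demo_pred = np.array([1.1, 1.9, 3.2, 3.8, 5.1])
print(metrics.r2_score(y_demo_true, y_demo_pred), adjusted_r2(y_demo_true, y_demo_pred, n=5, k=1))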
def convert2log(label, X_orig, drop_orig=True, replacezero=0.0):
"""
Replaces a column in a pandas dataframe with the log10 of its values.
Arguments
---------
label: feature label of the column to be log10'd
X_orig: pandas dataframe
Keywords
--------
drop_orig: if True, discards the original column after creating its log10 version
replacezero: value substituted for zeros before taking log10
Returns
-------
X: copy of the dataframe with the log10 column added
"""
try:
X = X_orig.copy()
X[label] = X[label].apply(lambda x: replacezero if x == 0 else x)
X['LOG10_'+label] = X[label].apply(np.log10)
if drop_orig:
X = X.drop(axis=1, labels=[label])
except (ValueError, AttributeError, TypeError) as e:
print("Cannot take log10 of this data type.")
return X
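A small illustration of this helper on a made-up one-column frame (the column name and values are hypothetical):
# Zeros are replaced with `replacezero` before taking log10, and the original column is dropped.
demo_df = pd.DataFrame({'POP': [100.0, 1000.0, 0.0]})
print(convert2log('POP', demo_df, replacezero=1.0))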
def variable_lookup(label):
"""
Looks up and prints the variable definition in the lookup table.
Arguments
---------
label: feature label
Keywords
--------
None
Returns
-------
None
"""
lookup_table = pd.read_csv("/Users/jieunchoi/DataScience/food_environment_atlas/csv_files/meta/Variable List-Table 1.csv")
try:
print(label+': ',lookup_table[lookup_table['Variable Code'] == label]['Variable Name'].values[0])
except IndexError:
print(label+" is not in the lookup table.")
def convert_state_names(n, abbr=False):
"""
Converts between the state abbreviation and the full name.
List taken from https://gist.github.com/rogerallen/1583593.
Arguments
---------
n: state name in either the abbreviated or fully-spelled out format
Keywords
--------
abbr: input is in the abbreviated format
Returns
-------
result: state name in either the abbreviated or fully-spelled out format
"""
us_state_abbrev = {
'Alabama': 'AL',
'Alaska': 'AK',
'Arizona': 'AZ',
'Arkansas': 'AR',
'California': 'CA',
'Colorado': 'CO',
'Connecticut': 'CT',
'Delaware': 'DE',
'District of Columbia': 'DC',
'Florida': 'FL',
'Georgia': 'GA',
'Hawaii': 'HI',
'Idaho': 'ID',
'Illinois': 'IL',
'Indiana': 'IN',
'Iowa': 'IA',
'Kansas': 'KS',
'Kentucky': 'KY',
'Louisiana': 'LA',
'Maine': 'ME',
'Maryland': 'MD',
'Massachusetts': 'MA',
'Michigan': 'MI',
'Minnesota': 'MN',
'Mississippi': 'MS',
'Missouri': 'MO',
'Montana': 'MT',
'Nebraska': 'NE',
'Nevada': 'NV',
'New Hampshire': 'NH',
'New Jersey': 'NJ',
'New Mexico': 'NM',
'New York': 'NY',
'North Carolina': 'NC',
'North Dakota': 'ND',
'Ohio': 'OH',
'Oklahoma': 'OK',
'Oregon': 'OR',
'Pennsylvania': 'PA',
'Rhode Island': 'RI',
'South Carolina': 'SC',
'South Dakota': 'SD',
'Tennessee': 'TN',
'Texas': 'TX',
'Utah': 'UT',
'Vermont': 'VT',
'Virginia': 'VA',
'Washington': 'WA',
'West Virginia': 'WV',
'Wisconsin': 'WI',
'Wyoming': 'WY',
}
try:
if abbr:
result = [x for x in us_state_abbrev.keys() if us_state_abbrev[x] == n][0]
return(result)
else:
return(us_state_abbrev[n])
except (KeyError, IndexError) as e:
print('Cannot convert the name. Check the spelling and/or the keyword option.')
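For example (illustrative only):
print(convert_state_names('Georgia'))        # prints 'GA'
print(convert_state_names('GA', abbr=True))  # prints 'Georgia'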
Merge the individual tables to make one master dataframe.
df_list = glob.glob('csv_files/*csv')
for i, df_file in enumerate(df_list):
print(i, df_file.split('files/')[-1].split('-Table')[0])
df = pd.read_csv(df_file)
df.rename(columns=lambda x: x.upper().strip(), inplace=True)
if i == 0:
master_df = df.copy()
else:
df.drop(columns=['STATE', 'COUNTY'], inplace=True)
master_df = master_df.merge(df, on=['FIPS'], how='left')
0 SOCIOECONOMIC 1 INSECURITY 2 ASSISTANCE 3 ACCESS 4 HEALTH 5 PRICES_TAXES 6 RESTAURANTS 7 STORES 8 LOCAL
master_df.head()
FIPS | STATE | COUNTY | PCT_NHWHITE10 | PCT_NHBLACK10 | PCT_HISP10 | PCT_NHASIAN10 | PCT_NHNA10 | PCT_NHPI10 | PCT_65OLDER10 | ... | CSA12 | PCH_CSA_07_12 | AGRITRSM_OPS07 | AGRITRSM_OPS12 | PCH_AGRITRSM_OPS_07_12 | AGRITRSM_RCT07 | AGRITRSM_RCT12 | PCH_AGRITRSM_RCT_07_12 | FARM_TO_SCHOOL09 | FARM_TO_SCHOOL13 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1001 | AL | Autauga | 77.246156 | 17.582599 | 2.400542 | 0.855766 | 0.397647 | 0.040314 | 11.995382 | ... | 3.0 | 50.000000 | 7.0 | 10.0 | 42.857143 | 228000.0 | 146000.0 | -35.964912 | 0.0 | NaN |
1 | 1003 | AL | Baldwin | 83.504787 | 9.308425 | 4.384824 | 0.735193 | 0.628755 | 0.043343 | 16.771185 | ... | 7.0 | -46.153846 | 18.0 | 16.0 | -11.111111 | 124000.0 | 204000.0 | 64.516129 | 0.0 | 0.0 |
2 | 1005 | AL | Barbour | 46.753105 | 46.691190 | 5.051535 | 0.389700 | 0.218524 | 0.087409 | 14.236807 | ... | 0.0 | -100.000000 | 27.0 | 32.0 | 18.518519 | 163000.0 | 304000.0 | 86.503067 | 0.0 | 1.0 |
3 | 1007 | AL | Bibb | 75.020729 | 21.924504 | 1.771765 | 0.096007 | 0.279293 | 0.030548 | 12.681650 | ... | 3.0 | 50.000000 | 5.0 | 6.0 | 20.000000 | NaN | 21000.0 | NaN | 0.0 | 0.0 |
4 | 1009 | AL | Blount | 88.887338 | 1.263040 | 8.070200 | 0.200621 | 0.497191 | 0.031402 | 14.722096 | ... | 4.0 | -42.857143 | 10.0 | 8.0 | -20.000000 | 293000.0 | 30000.0 | -89.761092 | 0.0 | 1.0 |
5 rows × 280 columns
All columns are numeric except for two, STATE and COUNTY.
master_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 3143 entries, 0 to 3142 Columns: 280 entries, FIPS to FARM_TO_SCHOOL13 dtypes: float64(248), int64(30), object(2) memory usage: 6.7+ MB
Add population data. We need to work a little bit on the column names in order to merge this df seamlessly onto the master data frame.
df_county = pd.read_csv("csv_files/meta/Supplemental Data - County-Table 1.csv")
df_county.dropna(axis=0, how="any", inplace=True)
df_county.rename(columns=lambda x: "_".join(x.upper().\
replace(",","").replace("POPULATION","POP").replace("ESTIMATE","EST").strip().split()), inplace=True)
col_tmp = df_county.columns.drop(['FIPS', 'STATE', 'COUNTY'])
df_county[col_tmp] = df_county[col_tmp].apply(lambda x: pd.to_numeric(x.astype(str).str.replace(',',''))).astype('float64')
df_county.drop(columns=['STATE', 'COUNTY'], inplace=True)
master_df = master_df.merge(df_county, on=['FIPS'], how='left')
master_df.info()
master_df.head()
<class 'pandas.core.frame.DataFrame'> Int64Index: 3143 entries, 0 to 3142 Columns: 287 entries, FIPS to POP_EST_2016 dtypes: float64(255), int64(30), object(2) memory usage: 6.9+ MB
FIPS | STATE | COUNTY | PCT_NHWHITE10 | PCT_NHBLACK10 | PCT_HISP10 | PCT_NHASIAN10 | PCT_NHNA10 | PCT_NHPI10 | PCT_65OLDER10 | ... | PCH_AGRITRSM_RCT_07_12 | FARM_TO_SCHOOL09 | FARM_TO_SCHOOL13 | 2010_CENSUS_POP | POP_EST_2011 | POP_EST_2012 | POP_EST_2013 | POP_EST_2014 | POP_EST_2015 | POP_EST_2016 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1001 | AL | Autauga | 77.246156 | 17.582599 | 2.400542 | 0.855766 | 0.397647 | 0.040314 | 11.995382 | ... | -35.964912 | 0.0 | NaN | 54571.0 | 55255.0 | 55027.0 | 54792.0 | 54977.0 | 55035.0 | 55416.0 |
1 | 1003 | AL | Baldwin | 83.504787 | 9.308425 | 4.384824 | 0.735193 | 0.628755 | 0.043343 | 16.771185 | ... | 64.516129 | 0.0 | 0.0 | 182265.0 | 186653.0 | 190403.0 | 195147.0 | 199745.0 | 203690.0 | 208563.0 |
2 | 1005 | AL | Barbour | 46.753105 | 46.691190 | 5.051535 | 0.389700 | 0.218524 | 0.087409 | 14.236807 | ... | 86.503067 | 0.0 | 1.0 | 27457.0 | 27326.0 | 27132.0 | 26938.0 | 26763.0 | 26270.0 | 25965.0 |
3 | 1007 | AL | Bibb | 75.020729 | 21.924504 | 1.771765 | 0.096007 | 0.279293 | 0.030548 | 12.681650 | ... | NaN | 0.0 | 0.0 | 22915.0 | 22736.0 | 22645.0 | 22501.0 | 22511.0 | 22561.0 | 22643.0 |
4 | 1009 | AL | Blount | 88.887338 | 1.263040 | 8.070200 | 0.200621 | 0.497191 | 0.031402 | 14.722096 | ... | -89.761092 | 0.0 | 1.0 | 57322.0 | 57707.0 | 57772.0 | 57746.0 | 57621.0 | 57676.0 | 57704.0 |
5 rows × 287 columns
Let's add some relevant columns from the 2013 US Census data since the main dataset only contains poverty rate and median income for 2015.
poverty_income_df_unf = pd.read_csv('csv_files/census/est13all.csv')
Drop empty header rows and reset the index. Select only the columns we want to append to the master data frame.
poverty_income_df = poverty_income_df_unf.drop(labels=[0,1,2,3], axis=0).copy()
poverty_income_df.columns = poverty_income_df_unf.iloc[2]
poverty_income_df = poverty_income_df.reset_index(drop=True)
poverty_income_df = poverty_income_df[['State FIPS Code', 'County FIPS Code', \
'Postal Code', 'Poverty Percent, All Ages', \
'Median Household Income', 'Poverty Percent, Age 0-17']]
poverty_income_df['Median Household Income'] = poverty_income_df['Median Household Income'].apply(lambda x: x.replace(",",""))
Rename columns, convert to the appropriate data types, and drop rows containing overall state data.
poverty_income_cols = poverty_income_df.columns.drop(labels=['State FIPS Code', 'County FIPS Code', 'Postal Code'])
poverty_income_df[poverty_income_cols] = poverty_income_df[poverty_income_cols].apply(pd.to_numeric, errors='coerce')
poverty_income_df = poverty_income_df.drop(poverty_income_df[poverty_income_df['County FIPS Code'] == '000'].index)
poverty_income_df.columns = ['STATE_FIPS', 'COUNTY_FIPS', \
'STATE', 'POVRATE13_ALL', \
'MEDHHINC13', 'POVRATE13_CHILD']
poverty_income_df['FIPS'] = poverty_income_df.apply(lambda row: int(row.STATE_FIPS)*1000+int(row.COUNTY_FIPS), axis=1)
poverty_income_df = poverty_income_df.drop(columns=['STATE_FIPS', 'COUNTY_FIPS', 'STATE'])
Merge with the main data frame.
master_df = master_df.merge(poverty_income_df, on=['FIPS'], how='left')
master_df.info()
master_df.head()
<class 'pandas.core.frame.DataFrame'> Int64Index: 3143 entries, 0 to 3142 Columns: 290 entries, FIPS to POVRATE13_CHILD dtypes: float64(258), int64(30), object(2) memory usage: 7.0+ MB
FIPS | STATE | COUNTY | PCT_NHWHITE10 | PCT_NHBLACK10 | PCT_HISP10 | PCT_NHASIAN10 | PCT_NHNA10 | PCT_NHPI10 | PCT_65OLDER10 | ... | 2010_CENSUS_POP | POP_EST_2011 | POP_EST_2012 | POP_EST_2013 | POP_EST_2014 | POP_EST_2015 | POP_EST_2016 | POVRATE13_ALL | MEDHHINC13 | POVRATE13_CHILD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1001 | AL | Autauga | 77.246156 | 17.582599 | 2.400542 | 0.855766 | 0.397647 | 0.040314 | 11.995382 | ... | 54571.0 | 55255.0 | 55027.0 | 54792.0 | 54977.0 | 55035.0 | 55416.0 | 13.5 | 51868.0 | 19.6 |
1 | 1003 | AL | Baldwin | 83.504787 | 9.308425 | 4.384824 | 0.735193 | 0.628755 | 0.043343 | 16.771185 | ... | 182265.0 | 186653.0 | 190403.0 | 195147.0 | 199745.0 | 203690.0 | 208563.0 | 14.2 | 47539.0 | 20.8 |
2 | 1005 | AL | Barbour | 46.753105 | 46.691190 | 5.051535 | 0.389700 | 0.218524 | 0.087409 | 14.236807 | ... | 27457.0 | 27326.0 | 27132.0 | 26938.0 | 26763.0 | 26270.0 | 25965.0 | 28.2 | 30981.0 | 42.4 |
3 | 1007 | AL | Bibb | 75.020729 | 21.924504 | 1.771765 | 0.096007 | 0.279293 | 0.030548 | 12.681650 | ... | 22915.0 | 22736.0 | 22645.0 | 22501.0 | 22511.0 | 22561.0 | 22643.0 | 23.1 | 39781.0 | 35.6 |
4 | 1009 | AL | Blount | 88.887338 | 1.263040 | 8.070200 | 0.200621 | 0.497191 | 0.031402 | 14.722096 | ... | 57322.0 | 57707.0 | 57772.0 | 57746.0 | 57621.0 | 57676.0 | 57704.0 | 17.2 | 44392.0 | 26.6 |
5 rows × 290 columns
Inspect the variable list to figure out which data columns should be dropped. We want to discard columns that are redundant (e.g., % change columns, which can be derived from the two years of data already included in the dataset) or not normalized to population (e.g., raw counts). Let's also discard data from 2014 and beyond, since we'll make predictions for 2013.
variable_df = pd.read_csv('/Users/jieunchoi/DataScience/food_environment_atlas/csv_files/meta/Variable List-Table 1.csv')
print(np.unique(variable_df.Units))
['# per 1,000 pop' '% change' '1,000 dollars' 'Acres' 'Acres/1,000 pop' 'Classification' 'Count' 'Dollars' 'Dollars/capita' 'Dollars/store' 'Legend' 'Percent' 'Percentage points' 'Ratio' 'Sq ft' 'Sq ft/1,000 pop']
data13 = [x for x in variable_df['Variable Code'] if int(x[-2:]) > 13]
badunits = ['% change', 'Sq ft', 'Percentage points', '1,000 dollars', 'Count', 'Dollars']
bad_variable_df = variable_df[(variable_df['Units'].isin(badunits))|(variable_df['Variable Code'].isin(data13))]
good_variable_df = variable_df[(~variable_df['Units'].isin(badunits))&(~variable_df['Variable Code'].isin(data13))]
Use the selected column names to filter the master data frame. Also drop >2013 columns added from the supplemental table.
master_df = master_df.drop(columns=bad_variable_df['Variable Code'].tolist()+['POP_EST_2014','POP_EST_2015','POP_EST_2016'])
Change the data type of the categorical variables. Any remaining integer columns can safely be cast to float for uniformity.
cat_vars = good_variable_df['Variable Code'][(good_variable_df.Units == 'Classification')|(good_variable_df.Units == 'Legend')]
master_df[cat_vars] = master_df[cat_vars].astype("category")
master_df['FIPS'] = master_df['FIPS'].astype("object")
master_df[master_df.columns[master_df.dtypes == int]] = master_df[master_df.columns[master_df.dtypes == int]].astype(float)
master_df.head()
FIPS | STATE | COUNTY | PCT_NHWHITE10 | PCT_NHBLACK10 | PCT_HISP10 | PCT_NHASIAN10 | PCT_NHNA10 | PCT_NHPI10 | PCT_65OLDER10 | ... | GHVEG_SQFTPTH12 | FARM_TO_SCHOOL09 | FARM_TO_SCHOOL13 | 2010_CENSUS_POP | POP_EST_2011 | POP_EST_2012 | POP_EST_2013 | POVRATE13_ALL | MEDHHINC13 | POVRATE13_CHILD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1001 | AL | Autauga | 77.246156 | 17.582599 | 2.400542 | 0.855766 | 0.397647 | 0.040314 | 11.995382 | ... | 0.000000 | 0.0 | NaN | 54571.0 | 55255.0 | 55027.0 | 54792.0 | 13.5 | 51868.0 | 19.6 |
1 | 1003 | AL | Baldwin | 83.504787 | 9.308425 | 4.384824 | 0.735193 | 0.628755 | 0.043343 | 16.771185 | ... | 326.477447 | 0.0 | 0.0 | 182265.0 | 186653.0 | 190403.0 | 195147.0 | 14.2 | 47539.0 | 20.8 |
2 | 1005 | AL | Barbour | 46.753105 | 46.691190 | 5.051535 | 0.389700 | 0.218524 | 0.087409 | 14.236807 | ... | 0.000000 | 0.0 | 1.0 | 27457.0 | 27326.0 | 27132.0 | 26938.0 | 28.2 | 30981.0 | 42.4 |
3 | 1007 | AL | Bibb | 75.020729 | 21.924504 | 1.771765 | 0.096007 | 0.279293 | 0.030548 | 12.681650 | ... | NaN | 0.0 | 0.0 | 22915.0 | 22736.0 | 22645.0 | 22501.0 | 23.1 | 39781.0 | 35.6 |
4 | 1009 | AL | Blount | 88.887338 | 1.263040 | 8.070200 | 0.200621 | 0.497191 | 0.031402 | 14.722096 | ... | 0.000000 | 0.0 | 1.0 | 57322.0 | 57707.0 | 57772.0 | 57746.0 | 17.2 | 44392.0 | 26.6 |
5 rows × 98 columns
Spot check the number of counties attributed to each state. GA has 159 counties compared to CA's 58! Remarkable.
master_df = master_df.drop_duplicates(['FIPS'])
state_county_df = master_df[['STATE', 'COUNTY']].groupby(by='STATE').count()
display(state_county_df.head(n=15))
COUNTY | |
---|---|
STATE | |
AK | 29 |
AL | 67 |
AR | 75 |
AZ | 15 |
CA | 58 |
CO | 64 |
CT | 8 |
DC | 1 |
DE | 3 |
FL | 67 |
GA | 159 |
HI | 5 |
IA | 99 |
ID | 44 |
IL | 102 |
Perform additional basic checks for problematic data.
There are some feature columns (float) with a large number of zeros. For instance, some counties have no Pacific Islander population and no recreation & fitness facilities. We will drop columns in which more than 10% of the values are 0.
for c in master_df.columns[master_df.dtypes == float]:
num_zeros = len(master_df[c][master_df[c] == 0])
num_rows = master_df.shape[0]
if num_zeros > num_rows*0.1:
print(c, num_zeros)
master_df.drop(columns=c, inplace=True)
PCT_NHPI10 485 RECFACPTH09 893 SUPERCPTH09 1489 SPECSPTH09 1140 FMRKTPTH09 1362 ORCHARD_ACRES07 360 ORCHARD_ACRES12 351 ORCHARD_ACRESPTH07 360 ORCHARD_ACRESPTH12 350 BERRY_ACRES07 844 BERRY_ACRES12 742 BERRY_ACRESPTH07 844 BERRY_ACRESPTH12 741 GHVEG_SQFTPTH07 1879 GHVEG_SQFTPTH12 1412
Next, we will examine missing data. Most of the columns that are missing >100 rows seem hyperspecific and understandably irrelevant to many counties, for example FRESHVEG_ACRES07, the number of acres of vegetables harvested for the fresh market in 2007. For simplicity, let's drop the columns with >300 rows of missing data. This will remove mainly agriculture/farm-related columns.
count_null_values = master_df.isnull().sum().sort_values(ascending=False)
print('There are',np.shape(master_df)[0],'rows total. The following columns are missing >100 rows.')
missing_gt100 = count_null_values[count_null_values > 100].copy()
print(missing_gt100)
missing_gt300 = count_null_values[(count_null_values > 300)].copy()
print('Dropping', len(missing_gt300),'columns with >300 missing rows.')
master_df = master_df.drop(labels=missing_gt300.index, axis=1)
There are 3143 rows total. The following columns are missing >100 rows. FRESHVEG_ACRESPTH07 1300 FRESHVEG_ACRES07 1300 FRESHVEG_ACRESPTH12 1227 FRESHVEG_ACRES12 1226 REDEMP_WICS12 1139 PC_WIC_REDEMP12 1139 REDEMP_WICS08 1064 PC_WIC_REDEMP08 1064 VEG_ACRESPTH12 612 VEG_ACRESPTH07 611 VEG_ACRES07 611 VEG_ACRES12 562 PCT_LOCLSALE07 346 PC_DIRSALES07 288 PCT_LOCLSALE12 287 PC_DIRSALES12 243 REDEMP_SNAPS12 242 FARM_TO_SCHOOL13 208 dtype: int64 Dropping 13 columns with >300 missing rows.
Let's deal with the missing data via imputation. We will fill in missing rows using the median value for each state. For columns that are missing data for entire states, we will fall back to the national median value.
cols_w_null_values = master_df.columns[master_df.isnull().any()]
for c in cols_w_null_values:
try:
master_df[c] = master_df.groupby('STATE')[c].apply(lambda x: x.fillna(x.median()))
except TypeError:
#for categorical variables, fill with the most frequent value, which is returned first in value_counts
master_df[c] = master_df.groupby('STATE')[c].apply(lambda x: x.fillna(x.value_counts().index[0]))
/Users/jieunchoi/anaconda/envs/myenv/lib/python3.6/site-packages/numpy/lib/nanfunctions.py:1076: RuntimeWarning: Mean of empty slice return np.nanmean(a, axis, out=out, keepdims=keepdims)
cols_w_null_values = master_df.columns[master_df.isnull().any()]
for c in cols_w_null_values:
try:
master_df[c] = master_df[c].fillna(master_df[c].median())
except TypeError:
master_df[c] = master_df[c].fillna(master_df[c].value_counts().index[0])
print('Any remaining missing values?: ', master_df.isnull().values.any())
print('There are a total of', np.shape(master_df)[0], 'counties (rows) and', np.shape(master_df)[1], 'features (columns).')
Any remaining missing values?: False There are a total of 3143 counties (rows) and 70 features (columns).
master_df.head()
FIPS | STATE | COUNTY | PCT_NHWHITE10 | PCT_NHBLACK10 | PCT_HISP10 | PCT_NHASIAN10 | PCT_NHNA10 | PCT_65OLDER10 | PCT_18YOUNGER10 | ... | PC_DIRSALES12 | FARM_TO_SCHOOL09 | FARM_TO_SCHOOL13 | 2010_CENSUS_POP | POP_EST_2011 | POP_EST_2012 | POP_EST_2013 | POVRATE13_ALL | MEDHHINC13 | POVRATE13_CHILD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1001 | AL | Autauga | 77.246156 | 17.582599 | 2.400542 | 0.855766 | 0.397647 | 11.995382 | 26.777959 | ... | 5.582238 | 0.0 | 0.0 | 54571.0 | 55255.0 | 55027.0 | 54792.0 | 13.5 | 51868.0 | 19.6 |
1 | 1003 | AL | Baldwin | 83.504787 | 9.308425 | 4.384824 | 0.735193 | 0.628755 | 16.771185 | 22.987408 | ... | 3.403433 | 0.0 | 0.0 | 182265.0 | 186653.0 | 190403.0 | 195147.0 | 14.2 | 47539.0 | 20.8 |
2 | 1005 | AL | Barbour | 46.753105 | 46.691190 | 5.051535 | 0.389700 | 0.218524 | 14.236807 | 21.906982 | ... | 0.478663 | 0.0 | 1.0 | 27457.0 | 27326.0 | 27132.0 | 26938.0 | 28.2 | 30981.0 | 42.4 |
3 | 1007 | AL | Bibb | 75.020729 | 21.924504 | 1.771765 | 0.096007 | 0.279293 | 12.681650 | 22.696923 | ... | 0.883314 | 0.0 | 0.0 | 22915.0 | 22736.0 | 22645.0 | 22501.0 | 23.1 | 39781.0 | 35.6 |
4 | 1009 | AL | Blount | 88.887338 | 1.263040 | 8.070200 | 0.200621 | 0.497191 | 14.722096 | 24.608353 | ... | 8.567571 | 0.0 | 1.0 | 57322.0 | 57707.0 | 57772.0 | 57746.0 | 17.2 | 44392.0 | 26.6 |
5 rows × 70 columns
Let's begin by exploring the distributions of some parameters. First, our target variable: the adult obesity rate is the percent of the population aged > 20 with BMI > 30.
s = sns.distplot(master_df['PCT_OBESE_ADULTS13'].dropna())
_ = s.set_xlabel('2013 Obesity Rate [%]')
_ = s.set_ylabel('pdf')
Just as an aside, construct a QQ plot and run the Shapiro-Wilk test for normality (H0 = normal distribution). These two checks rule out normality for our obesity rate distribution.
For linear regression, normality of the dependent variable is not a required assumption. Rather, the assumption is that the residual errors—any variation not explained by the features—are normally distributed. Some references note that even residual normality does not need to be satisfied as the sample size increases (CLT).
f, ax = plt.subplots(1,1,figsize=(5,5))
fig = gofplots.qqplot(master_df['PCT_OBESE_ADULTS13'], line='s', ax=ax, alpha=0.2, c='#1f77b4')
st, p = sp.stats.shapiro(master_df['PCT_OBESE_ADULTS13'])
print('Shapiro-Wilk normality test: W={:6.5f}, p={:6.3e}'.format(st, p))
Shapiro-Wilk normality test: W=0.99042, p=1.075e-13
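As a side note, the residual-normality assumption can be checked directly. Below is a minimal sketch (not part of the main pipeline) that fits a quick OLS of the target on two numeric columns already in master_df and runs the same Shapiro-Wilk test on the residuals; the choice of POVRATE13_ALL and MEDHHINC13 as predictors is arbitrary and purely illustrative.
# Fit a simple two-feature OLS and test the residuals, rather than the target itself, for normality.
resid_df = master_df[['PCT_OBESE_ADULTS13', 'POVRATE13_ALL', 'MEDHHINC13']].dropna()
lr_demo = LinearRegression().fit(resid_df[['POVRATE13_ALL', 'MEDHHINC13']], resid_df['PCT_OBESE_ADULTS13'])
residuals = resid_df['PCT_OBESE_ADULTS13'] - lr_demo.predict(resid_df[['POVRATE13_ALL', 'MEDHHINC13']])
st_r, p_r = sp.stats.shapiro(residuals)
print('Shapiro-Wilk on OLS residuals: W={:6.5f}, p={:6.3e}'.format(st_r, p_r))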
Let's look at some other parameter distributions. There's a remarkable spread in county population size (also, look how log-symmetric-ish it is!). The adult diabetes rate is the percentage of the population aged > 20 with diabetes (types 1 and 2, excluding gestational diabetes). The poverty rate is the percentage of the population with income below the poverty threshold, which varies by family size, age of the head of the household, etc.; for 2017, it was approximately $12,500 for an individual and $25,100 for a family of 4. Low access to store is defined as the percentage of the population living > 1 mile (> 10 miles) from the nearest supermarket or grocery store in an urban (rural) area. The last distribution is strongly skewed toward higher percentages of the population with low access to a store.
f, axes = plt.subplots(1,4,figsize=(15,4))
s = sns.distplot(np.log10(master_df['2010_CENSUS_POP'].dropna()), ax=axes[0])
s.set_xlabel('log(2010 Census Population)')
s = sns.distplot(master_df['PCT_DIABETES_ADULTS13'].dropna(), ax=axes[1])
s.set_xlabel('2013 Adult Diabetes Rate [%]')
s = sns.distplot(master_df['POVRATE13_ALL'].dropna(), ax=axes[2])
s.set_xlabel('2013 All Ages Poverty Rate [%]')
s = sns.distplot(master_df['PCT_LACCESS_POP10'].dropna(), ax=axes[3])
s.set_xlabel('2010 Low Access to Store [%]')
f.tight_layout()
min_pop_ind = master_df['2010_CENSUS_POP'].dropna().idxmin()
max_pop_ind = master_df['2010_CENSUS_POP'].dropna().idxmax()
print('According to the 2010 Census, county population ranges from ',
master_df['COUNTY'][min_pop_ind], master_df['STATE'][min_pop_ind],':',
int(min(master_df['2010_CENSUS_POP'].dropna())),
'to',
master_df['COUNTY'][max_pop_ind], master_df['STATE'][max_pop_ind],':',
int(max(master_df['2010_CENSUS_POP'].dropna())))
According to the 2010 Census, county population ranges from Loving TX : 82 to Los Angeles CA : 9818605
Let's briefly examine the counties where 100% of the population experiences low access to store, to understand the extreme values. After spot-checking a few counties along with a cross-referenced Google Maps search, it seems that what appear to be regular grocery stores do exist near residential areas in these counties. Some guesses for why ~100% of the population in these counties is labeled as having low access to grocery stores: (1) these grocery stores do not meet the somewhat strict definition of "supermarket/large grocery store," e.g., more than $2 million in annual sales, or (2) population data were not properly allocated to their 0.5 sq. km grids. The 100% designation does appear to be correct for some of these counties (e.g., for Kiowa, CO, the nearest large grocery store, a Safeway, is ~8 miles from the town center), while for others it is more ambiguous. We will therefore proceed without attempting to correct or remove these potential outliers, especially given their small numbers.
master_df[['STATE', 'COUNTY', 'METRO13', 'PCT_LACCESS_POP10']][master_df['PCT_LACCESS_POP10']>99.99].head()
STATE | COUNTY | METRO13 | PCT_LACCESS_POP10 | |
---|---|---|---|---|
72 | AK | Denali | 0 | 100.000001 |
81 | AK | Lake and Peninsula | 0 | 100.000000 |
95 | AK | Yukon-Koyukuk | 0 | 100.000000 |
271 | CO | Hinsdale | 0 | 99.999777 |
275 | CO | Kiowa | 0 | 100.000000 |
We can also examine the prevalence of adult obesity broken down separately for metro/nonmetro counties and counties with persistent poverty (poverty rate > 20% in the 1980, 1990, and 2000 censuses and the American Community Survey 5-year estimates for 2007-2011).
g = sns.FacetGrid(master_df, row='METRO13', col='PERPOV10', height=4)
g.map(sns.distplot, 'PCT_OBESE_ADULTS13').set_axis_labels("2013 Obesity Rate [%]", "pdf")
axes = g.axes.flatten()
axes[0].set_title("Non-Metro/No Persistent Poverty")
axes[1].set_title("Non-Metro/Persistent Poverty")
axes[2].set_title("Metro/No Persistent Poverty")
axes[3].set_title("Metro/Persistent Poverty")
Text(0.5, 1.0, 'Metro/Persistent Poverty')
Let's now move onto more explicitly examining relationships between obesity and the available parameters.
The positive trend between poverty and obesity rate is not too surprising if we believe that poverty is associated with poor food choices, which are in turn associated with poor health/obesity. There is a large spread in obesity rates as a function of average milk price, though there may be a weak non-linear relationship. There isn't an obvious, strong correlation between the prevalence of fast-food restaurants and obesity rates. Based on studies such as this, we know that obesity disproportionately affects African-Americans and Mexican-Americans. This is borne out by the data (at least for the African-American population, as separate Mexican-American data is not available).
f, axes = plt.subplots(1,4,figsize=(15,4))
s = sns.scatterplot(y='PCT_OBESE_ADULTS13', x='POVRATE13_ALL', data=master_df, ax=axes[0], s=5)
s.set_xlabel('2013 All Ages Poverty Rate [%]')
s.set_ylabel('2013 Adult Obesity [%]')
s.set_xlim(0,50)
s = sns.scatterplot(y='PCT_OBESE_ADULTS13', x='MILK_PRICE10', data=master_df, ax=axes[1], s=5)
s.set_xlabel('2010 Milk Price/National Average')
s.set_ylabel('2013 Adult Obesity [%]')
s.set_xlim(0.7,1.5)
s = sns.scatterplot(y='PCT_OBESE_ADULTS13', x='FFRPTH09', data=master_df, ax=axes[2], s=5)
s.set_xlabel('2009 Fast-Food Restaurants/1000')
s.set_xlim(0,1.5)
s.set_ylabel('2013 Adult Obesity [%]')
s = sns.scatterplot(y='PCT_OBESE_ADULTS13', x='PCT_NHBLACK10', data=master_df, ax=axes[3], s=5, legend='full')
s.set_xlabel('2010 Black & African/American [%]')
s.set_ylabel('2013 Adult Obesity [%]')
s.set_xlim(0,90)
f.tight_layout()
We can examine the same distributions broken up by race (above/below the national average according to the 2010 U.S. Census). It's interesting that the obesity and diabetes rate distributions differ visibly between counties where the black/African-American population fraction exceeds the national average (12.6%) and counties where it falls below. The same does not hold for counties where the white population fraction is above vs. below the national average (72.4%).
small_df = master_df[['PCT_OBESE_ADULTS13', 'POVRATE13_ALL', 'MILK_PRICE10', 'FFRPTH09', 'PCT_DIABETES_ADULTS13', 'PCT_NHBLACK10']].copy()
small_df['BLACK10_ABVNATAVG'] = small_df.apply(lambda x: int(x['PCT_NHBLACK10'] > 12.6), axis=1)
sns.pairplot(small_df, vars=['PCT_OBESE_ADULTS13', 'POVRATE13_ALL', 'MILK_PRICE10', 'FFRPTH09', 'PCT_DIABETES_ADULTS13'], hue='BLACK10_ABVNATAVG')
<seaborn.axisgrid.PairGrid at 0x117e5ec50>
small_df = master_df[['PCT_OBESE_ADULTS13', 'POVRATE13_ALL', 'MILK_PRICE10', 'FFRPTH09', 'PCT_DIABETES_ADULTS13', 'PCT_NHWHITE10']].copy()
small_df['WHITE10_ABVNATAVG'] = small_df.apply(lambda x: int(x['PCT_NHWHITE10'] > 72.4), axis=1)
sns.pairplot(small_df, vars=['PCT_OBESE_ADULTS13', 'POVRATE13_ALL', 'MILK_PRICE10', 'FFRPTH09', 'PCT_DIABETES_ADULTS13'], hue='WHITE10_ABVNATAVG')
<seaborn.axisgrid.PairGrid at 0x117e75898>
Next, although it may seem trivial, let's inspect poverty rate vs. median household income. As expected, there's a negative trend, but the exact form of this scatterplot is pretty interesting. The steep rise below a poverty rate of 10% implies that even in low-poverty areas, there's a large variability in the overall affluence of the population (ranging from ~$40k to over $100k in median household income). Similarly, the long tail in the poverty rate beyond 10% implies that there are factors beyond median income at play (cost of living is one example). At a fixed poverty rate, counties with a higher percentage of white population seem to have higher median household incomes.
s = sns.scatterplot(x='POVRATE13_ALL', y='MEDHHINC13', hue='PCT_NHWHITE10', palette='Blues', hue_norm=(0,90), data=master_df, s=8)
legend = s.legend()
legend.texts[0].set_text("2010 Non-Hispanic White [%]")
_ = s.set_xlabel('2013 All Ages Poverty Rate [%]')
_ = s.set_ylabel('2013 Median Household Income [$]')
Just for fun, let's select a random subset of the counties and inspect the breakdown in race.
total_rows = master_df.shape[0]
rand_ind = random.sample(range(total_rows), 6)
plt.figure(figsize=(15,2.5))
race_df = master_df.iloc[rand_ind,:][['FIPS', 'STATE', 'COUNTY', 'PCT_NHWHITE10', 'PCT_NHBLACK10', 'PCT_HISP10', 'PCT_NHASIAN10', 'PCT_NHNA10']].copy()
race_df['COUNTY_STATE'] = race_df['COUNTY']+', '+race_df['STATE']
race_df = race_df.drop(columns=['FIPS', 'STATE', 'COUNTY'])
race_column_list = race_df.columns.drop('COUNTY_STATE')
race_df = race_df.rename(columns=lambda x: x.split('PCT_')[1].split('10')[0] if x in race_column_list else x)
race_column_list = race_df.columns.drop('COUNTY_STATE').drop('HISP')
race_df = race_df.rename(columns=lambda x: x.split('NH')[1] if x in race_column_list else x)
race_df = pd.melt(race_df, id_vars=['COUNTY_STATE'])
s = sns.barplot(x='COUNTY_STATE', y='value', hue='variable', data=race_df)
leg = plt.legend(bbox_to_anchor=(1.01, 1), loc=2, frameon=False)
s.set_xlabel('County, State')
s.set_ylabel('2010 Population [%]')
Text(0, 0.5, '2010 Population [%]')
Get a list of features that have outliers as determined by the IQR score for reference later. Outliers are understandable and even expected for certain features like ethnicity and population.
for c in master_df.columns[master_df.dtypes == float]:
q1 = master_df[c].quantile(0.25)
q3 = master_df[c].quantile(0.75)
iqr = q3-q1
n_potential_outliers = sum((master_df[c] < (q1-1.5*iqr))|(master_df[c] > (q3+1.5*iqr)))
if n_potential_outliers > 50:
print('{:25}'.format(c), n_potential_outliers)
PCT_NHWHITE10 69 PCT_NHBLACK10 420 PCT_HISP10 378 PCT_NHASIAN10 391 PCT_NHNA10 442 PCT_65OLDER10 70 PCT_18YOUNGER10 125 PC_SNAPBEN10 65 PCT_FREE_LUNCH09 58 PCT_REDUCED_LUNCH09 104 PCT_SFSP09 341 PCT_WIC09 159 PCT_CACFP09 146 PCT_LACCESS_POP10 175 PCT_LACCESS_LOWI10 223 PCT_LACCESS_HHNV10 175 PCT_LACCESS_CHILD10 176 PCT_LACCESS_SENIORS10 240 PCT_OBESE_ADULTS08 191 PCT_OBESE_ADULTS13 79 FFRPTH09 71 PC_FSRSALES12 107 GROCPTH09 242 CONVSPTH09 104 SNAPSPTH12 83 WICSPTH08 234 WICSPTH12 233 PCT_LOCLFARM07 237 PCT_LOCLFARM12 224 PCT_LOCLSALE12 423 PC_DIRSALES07 236 PC_DIRSALES12 261 2010_CENSUS_POP 415 POP_EST_2011 418 POP_EST_2012 417 POP_EST_2013 416 POVRATE13_ALL 67 MEDHHINC13 116
We have to do a few more things before we can throw the data at ML algorithms.
We will rename the data to X following the general convention, then split into training and test sets. We'll remove the 'PCT_DIABETES_ADULTS' columns because Type II diabetes is strongly associated with obesity ("People who are obese -- more than 20% over their ideal body weight for their height -- are at particularly high risk of developing type 2 diabetes and its related medical problems.") and we want to predict obesity/diabetes from food environment factors, not from each other.
X = master_df.drop(axis=1, labels=['FIPS', 'STATE', 'COUNTY', 'PCT_OBESE_ADULTS08', 'PCT_OBESE_ADULTS13', \
'PCT_DIABETES_ADULTS08', 'PCT_DIABETES_ADULTS13'])
y = master_df['PCT_OBESE_ADULTS13']
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3)
The features that will be used to predict the obesity rate are listed below. Many of these features appear to be various representations/encodings of poverty/socioeconomic status.
for t in X_train.columns:
try:
variable_lookup(t)
except:
print(t)
PCT_NHWHITE10: % White, 2010 PCT_NHBLACK10: % Black, 2010 PCT_HISP10: % Hispanic, 2010 PCT_NHASIAN10: % Asian, 2010 PCT_NHNA10: % American Indian or Alaska Native, 2010 PCT_65OLDER10: % Population 65 years or older, 2010 PCT_18YOUNGER10: % Population under age 18, 2010 PERPOV10: Persistent-poverty counties, 2010 PERCHLDPOV10: Persistent-child-poverty counties, 2010 METRO13: Metro/nonmetro counties, 2010 POPLOSS10: Population-loss counties, 2010 FOODINSEC_10_12: Household food insecurity (%, three-year average), 2010-12* VLFOODSEC_10_12: Household very low food security (%, three-year average), 2010-12* FOODINSEC_CHILD_01_07: Child food insecurity (% households, multiple-year average), 2001-07* FOODINSEC_CHILD_03_11: Child food insecurity (% households, multiple-year average), 2003-11* REDEMP_SNAPS12: SNAP redemptions/SNAP-authorized stores, 2012 PCT_SNAP12: SNAP participants (% pop), 2012* PC_SNAPBEN10: SNAP benefits per capita, 2010 SNAP_PART_RATE08: SNAP participants (% eligible pop), 2008* SNAP_PART_RATE13: SNAP participants (% eligible pop), 2013* SNAP_OAPP09: SNAP online application, 2009* SNAP_CAP09: SNAP Combined Application Project , 2009* SNAP_BBCE09: SNAP Broad-based Categorical Eligibility, 2009* SNAP_REPORTSIMPLE09: SNAP simplified reporting, 2009* PCT_NSLP09: National School Lunch Program participants (% pop), 2009* PCT_FREE_LUNCH09: Students eligible for free lunch (%), 2009 PCT_REDUCED_LUNCH09: Students eligible for reduced-price lunch (%), 2009 PCT_SBP09: School Breakfast Program participants (% pop), 2009* PCT_SFSP09: Summer Food Service Program participants (% pop), 2009* PCT_WIC09: WIC participants (% pop), 2009* PCT_CACFP09: Child & Adult Care (% pop), 2009* PCT_LACCESS_POP10: Population, low access to store (%), 2010 PCT_LACCESS_LOWI10: Low income & low access to store (%), 2010 PCT_LACCESS_HHNV10: Households, no car & low access to store (%), 2010 PCT_LACCESS_CHILD10: Children, low access to store (%), 2010 PCT_LACCESS_SENIORS10: Seniors, low access to store (%), 2010 MILK_PRICE10: Price of low-fat milk/national average, 2010** SODA_PRICE10: Price of sodas/national average, 2010** MILK_SODA_PRICE10: Price of low-fat milk/price of sodas, 2010** FFRPTH09: Fast-food restaurants/1,000 pop, 2009 PC_FFRSALES07: Expenditures per capita, fast food, 2007* PC_FFRSALES12: Expenditures per capita, fast food, 2012* PC_FSRSALES07: Expenditures per capita, restaurants, 2007* PC_FSRSALES12: Expenditures per capita, restaurants, 2012* GROCPTH09: Grocery stores/1,000 pop, 2009 CONVSPTH09: Convenience stores/1,000 pop, 2009 SNAPSPTH12: SNAP-authorized stores/1,000 pop, 2012 WICSPTH08: WIC-authorized stores/1,000 pop, 2008 WICSPTH12: WIC-authorized stores/1,000 pop, 2012 PCT_LOCLFARM07: Farms with direct sales (%), 2007 PCT_LOCLFARM12: Farms with direct sales (%), 2012 PCT_LOCLSALE12: Direct farm sales (%), 2012 PC_DIRSALES07: Direct farm sales per capita, 2007 PC_DIRSALES12: Direct farm sales per capita, 2012 FARM_TO_SCHOOL09: Farm to school program, 2009 FARM_TO_SCHOOL13: Farm to school program, 2013 2010_CENSUS_POP POP_EST_2011 POP_EST_2012 POP_EST_2013 POVRATE13_ALL MEDHHINC13 POVRATE13_CHILD
Some features are better represented on a log scale. We log-transform columns whose IQR is small relative to their full range (i.e., strongly skewed distributions), plus a few columns chosen by iterating on the model.
cont_feats = X_train.columns[X_train.dtypes == float]
columns_to_log = ['PCT_NHWHITE10', 'PCT_NHBLACK10', 'PCT_LOCLFARM12'] #from iterating on the model
for c in cont_feats:
q1 = X_train[c].quantile(0.25)
q3 = X_train[c].quantile(0.75)
iqr = q3-q1
if iqr < (0.1)*(max(X_train[c])-min(X_train[c])):
columns_to_log.append(c)
for c in np.unique(columns_to_log):
#replace zeros with the training-set median before taking log10, and apply the same value to the test set
X_train_median = np.median(X_train[c])
X_train = convert2log(c, X_train, replacezero=X_train_median)
X_test = convert2log(c, X_test, replacezero=X_train_median)
X_train.describe()
PCT_65OLDER10 | FOODINSEC_10_12 | VLFOODSEC_10_12 | FOODINSEC_CHILD_01_07 | FOODINSEC_CHILD_03_11 | REDEMP_SNAPS12 | PCT_SNAP12 | PC_SNAPBEN10 | SNAP_PART_RATE08 | SNAP_PART_RATE13 | ... | LOG10_PCT_NHNA10 | LOG10_PCT_NHWHITE10 | LOG10_PCT_REDUCED_LUNCH09 | LOG10_PC_DIRSALES07 | LOG10_PC_DIRSALES12 | LOG10_POP_EST_2011 | LOG10_POP_EST_2012 | LOG10_POP_EST_2013 | LOG10_WICSPTH08 | LOG10_WICSPTH12 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2.200000e+03 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | ... | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 |
mean | 15.880158 | 15.020455 | 5.790818 | 8.596727 | 9.283545 | 2.492448e+05 | 15.066528 | 18.358982 | 68.310000 | 86.266319 | ... | -0.363553 | 1.871002 | 0.951654 | 0.603828 | 0.602346 | 4.449571 | 4.449523 | 4.449678 | -0.681956 | -0.711227 |
std | 4.205627 | 2.360521 | 0.859697 | 1.907775 | 1.806446 | 1.230882e+05 | 3.624165 | 9.446565 | 10.388072 | 8.995722 | ... | 0.538120 | 0.165124 | 0.159644 | 0.532958 | 0.553057 | 0.639121 | 0.640911 | 0.642253 | 0.279336 | 0.260403 |
min | 3.470599 | 8.700000 | 3.200000 | 4.800000 | 5.100000 | 0.000000e+00 | 5.866367 | 1.374386 | 48.000000 | 56.955000 | ... | -1.834039 | 0.426172 | -1.240799 | -2.184197 | -2.658979 | 1.954243 | 1.908485 | 1.949390 | -1.829084 | -1.836582 |
25% | 13.137261 | 13.400000 | 5.100000 | 7.200000 | 7.900000 | 1.629788e+05 | 12.391268 | 11.300859 | 61.000000 | 79.024000 | ... | -0.699490 | 1.826364 | 0.882237 | 0.315982 | 0.290486 | 4.034518 | 4.035820 | 4.033444 | -0.870163 | -0.887251 |
50% | 15.569573 | 14.900000 | 5.700000 | 8.300000 | 9.000000 | 2.457496e+05 | 15.223027 | 16.966789 | 66.000000 | 86.331000 | ... | -0.504170 | 1.933048 | 0.967025 | 0.616968 | 0.635756 | 4.409603 | 4.408698 | 4.409053 | -0.720273 | -0.750353 |
75% | 18.235988 | 16.600000 | 6.500000 | 9.500000 | 10.400000 | 3.267956e+05 | 18.353499 | 23.615687 | 78.000000 | 92.597000 | ... | -0.189132 | 1.973788 | 1.043876 | 0.959782 | 0.964971 | 4.817666 | 4.818016 | 4.818668 | -0.531058 | -0.575330 |
max | 34.129068 | 20.900000 | 8.100000 | 12.600000 | 12.800000 | 1.253321e+06 | 21.899881 | 76.284852 | 100.000000 | 100.000000 | ... | 1.977475 | 1.996350 | 1.658977 | 2.397940 | 2.309440 | 6.717228 | 6.718503 | 6.719355 | 0.664542 | 0.476254 |
8 rows × 53 columns
Next, scale the features so they are all of the same order of magnitude (min-max scaling to [0, 1]), and apply the transformation fitted on the training set to the test set as well. Let's proceed without the binary/categorical features, (1) for simplicity and (2) because these features don't show much sensitivity to our target.
scaler = preprocessing.MinMaxScaler()
cont_feats = X_train.columns[X_train.dtypes == float]
cat_feats = X_train.columns[X_train.dtypes != float]
scaler.fit(X_train[cont_feats])
X_scaled = pd.DataFrame(scaler.transform(X_train[cont_feats]), \
index=X_train.index, columns=cont_feats)
#X_scaled = X_scaled.merge(X_train[cat_feats], left_index=True, right_index=True)
X_unscaled_test = X_test.copy()
X_test = pd.DataFrame(scaler.transform(X_test[cont_feats]), index=X_test.index, columns=cont_feats)
#X_test = X_test.merge(X_unscaled_test[cat_feats], left_index=True, right_index=True)
X_scaled.describe()
PCT_65OLDER10 | FOODINSEC_10_12 | VLFOODSEC_10_12 | FOODINSEC_CHILD_01_07 | FOODINSEC_CHILD_03_11 | REDEMP_SNAPS12 | PCT_SNAP12 | PC_SNAPBEN10 | SNAP_PART_RATE08 | SNAP_PART_RATE13 | ... | LOG10_PCT_NHNA10 | LOG10_PCT_NHWHITE10 | LOG10_PCT_REDUCED_LUNCH09 | LOG10_PC_DIRSALES07 | LOG10_PC_DIRSALES12 | LOG10_POP_EST_2011 | LOG10_POP_EST_2012 | LOG10_POP_EST_2013 | LOG10_WICSPTH08 | LOG10_WICSPTH12 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | ... | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 |
mean | 0.404768 | 0.518070 | 0.528738 | 0.486760 | 0.543318 | 0.198868 | 0.573808 | 0.226732 | 0.390577 | 0.680946 | ... | 0.385801 | 0.920169 | 0.756077 | 0.608455 | 0.656411 | 0.523900 | 0.528280 | 0.524173 | 0.460024 | 0.486569 |
std | 0.137177 | 0.193485 | 0.175448 | 0.244587 | 0.234603 | 0.098210 | 0.226037 | 0.126105 | 0.199771 | 0.208984 | ... | 0.141183 | 0.105163 | 0.055054 | 0.116312 | 0.111315 | 0.134185 | 0.133245 | 0.134645 | 0.112020 | 0.112590 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.315302 | 0.385246 | 0.387755 | 0.307692 | 0.363636 | 0.130038 | 0.406954 | 0.132511 | 0.250000 | 0.512696 | ... | 0.297664 | 0.891741 | 0.732138 | 0.545636 | 0.593643 | 0.436759 | 0.442272 | 0.436912 | 0.384549 | 0.410462 |
50% | 0.394637 | 0.508197 | 0.510204 | 0.448718 | 0.506494 | 0.196079 | 0.583569 | 0.208147 | 0.346154 | 0.682449 | ... | 0.348908 | 0.959685 | 0.761377 | 0.611323 | 0.663135 | 0.515509 | 0.519793 | 0.515657 | 0.444658 | 0.469653 |
75% | 0.481609 | 0.647541 | 0.673469 | 0.602564 | 0.688312 | 0.260744 | 0.778814 | 0.296905 | 0.576923 | 0.828017 | ... | 0.431563 | 0.985630 | 0.787880 | 0.686138 | 0.729397 | 0.601183 | 0.604890 | 0.601530 | 0.520538 | 0.545327 |
max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 53 columns
We will remove feature columns with VIF > 10.
Note: collinearity can result in unstable coefficients for linear regression. It is less of a concern for decision trees, but it can make feature importances harder to interpret.
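To make the definition concrete, the VIF for feature j is 1 / (1 - R^2_j), where R^2_j comes from regressing feature j on all of the remaining features. Below is a minimal sketch of this for a single column, assuming X_scaled as constructed above (illustrative only; the actual computation that follows uses statsmodels).
# Regress the first column on the rest and convert its R^2 into a VIF.
first_col = X_scaled.columns[0]
other_cols = X_scaled.columns.drop(first_col)
r2_j = LinearRegression().fit(X_scaled[other_cols], X_scaled[first_col]).score(X_scaled[other_cols], X_scaled[first_col])
print('VIF({}) = {:.2f}'.format(first_col, 1.0/(1.0 - r2_j)))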
vif = pd.Series([variance_inflation_factor(add_constant(X_scaled).values, i) \
for i in range(add_constant(X_scaled).shape[1])], index=['const']+X_scaled.columns.tolist())
/Users/jieunchoi/anaconda/envs/myenv/lib/python3.6/site-packages/numpy/core/fromnumeric.py:2389: FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead. return ptp(axis=axis, out=out, **kwargs)
to_drop = vif.index[vif>10].tolist()
to_drop.remove('const')
It's interesting to observe the correlation matrix as well.
plt.figure(figsize=(11,10))
s = sns.heatmap(X_scaled.corr(), xticklabels=X_scaled.columns, yticklabels=X_scaled.columns)
print(to_drop)
['FOODINSEC_CHILD_01_07', 'FOODINSEC_CHILD_03_11', 'PCT_SNAP12', 'PCT_SBP09', 'PCT_LACCESS_POP10', 'PCT_LACCESS_CHILD10', 'MILK_PRICE10', 'SODA_PRICE10', 'MILK_SODA_PRICE10', 'PC_FFRSALES07', 'PC_FFRSALES12', 'PC_FSRSALES07', 'PC_FSRSALES12', 'POVRATE13_ALL', 'POVRATE13_CHILD', 'LOG10_2010_CENSUS_POP', 'LOG10_POP_EST_2011', 'LOG10_POP_EST_2012', 'LOG10_POP_EST_2013']
X_scaled = X_scaled.drop(columns=to_drop, axis=1)
X_test = X_test.drop(columns=to_drop, axis=1) #apply to the test set also so we don't forget later!
X_scaled.head()
PCT_65OLDER10 | FOODINSEC_10_12 | VLFOODSEC_10_12 | REDEMP_SNAPS12 | PC_SNAPBEN10 | SNAP_PART_RATE08 | SNAP_PART_RATE13 | PCT_NSLP09 | PCT_FREE_LUNCH09 | PCT_SFSP09 | ... | LOG10_PCT_LOCLSALE12 | LOG10_PCT_NHASIAN10 | LOG10_PCT_NHBLACK10 | LOG10_PCT_NHNA10 | LOG10_PCT_NHWHITE10 | LOG10_PCT_REDUCED_LUNCH09 | LOG10_PC_DIRSALES07 | LOG10_PC_DIRSALES12 | LOG10_WICSPTH08 | LOG10_WICSPTH12 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3015 | 0.381701 | 0.204918 | 0.306122 | 0.221873 | 0.375637 | 0.711538 | 0.482820 | 0.657248 | 0.477718 | 0.313902 | ... | 0.526186 | 0.395177 | 0.578183 | 0.195719 | 0.991125 | 0.724763 | 0.611169 | 0.663135 | 0.529675 | 0.510779 |
3059 | 0.486861 | 0.450820 | 0.346939 | 0.191609 | 0.142463 | 0.346154 | 1.000000 | 0.525096 | 0.378349 | 0.295754 | ... | 0.723613 | 0.433470 | 0.554845 | 0.306724 | 0.991254 | 0.838837 | 0.905857 | 0.920026 | 0.581093 | 0.603447 |
2142 | 0.478652 | 0.540984 | 0.693878 | 0.304894 | 0.449968 | 0.423077 | 0.539250 | 0.724045 | 0.669043 | 0.078103 | ... | 0.389083 | 0.383781 | 0.764714 | 0.797763 | 0.877798 | 0.774856 | 0.570662 | 0.571589 | 0.504645 | 0.413468 |
1330 | 0.579503 | 0.155738 | 0.326531 | 0.108204 | 0.089784 | 0.250000 | 0.689209 | 0.695890 | 0.333892 | 0.414491 | ... | 0.365171 | 0.663391 | 0.451798 | 0.295962 | 0.970904 | 0.787307 | 0.686941 | 0.752539 | 0.592550 | 0.636105 |
217 | 0.564025 | 0.565574 | 0.510204 | 0.075150 | 0.091523 | 0.000000 | 0.220490 | 0.240939 | 0.294896 | 0.033815 | ... | 0.593569 | 0.495355 | 0.480814 | 0.576048 | 0.957520 | 0.746627 | 0.539264 | 0.678855 | 0.488966 | 0.573735 |
5 rows × 34 columns
Before we proceed, let's first establish a baseline performance to compare with subsequent results. A very simple baseline guess is the median of the target values. We will use two metrics: MAE (mean absolute error, which we want to minimize) and adjusted R2 (one minus the ratio of residual variance to total variance in y, adjusted for the numbers of observations and features; we want it as close to 1 as possible).
Unlike R2, adjusted R2 will not arbitrarily increase with an increasing number of terms if the added complexity in the model does not actually improve the prediction.
Another popular choice is RMSE (root mean squared error, i.e., the standard deviation of the residuals) because it's easily and smoothly differentiable, but the squaring makes it quite sensitive to outliers.
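As a toy illustration of how these metrics behave (made-up numbers, not part of the pipeline, with one deliberately bad prediction to show RMSE's sensitivity to outliers):
y_toy_true = np.array([20.0, 25.0, 30.0, 35.0])
y_toy_pred = np.array([21.0, 24.0, 29.0, 45.0])  # last prediction misses by 10
print('MAE   :', metrics.mean_absolute_error(y_toy_true, y_toy_pred))
print('RMSE  :', np.sqrt(metrics.mean_squared_error(y_toy_true, y_toy_pred)))  # dominated by the single big miss
print('R2_adj:', adjusted_r2(y_toy_true, y_toy_pred, n=len(y_toy_true), k=2))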
kf = model_selection.KFold(n_splits=5)
mae_list = []
for train_index, val_index in kf.split(X_scaled):
X_train_kf, X_val_kf = X_scaled.iloc[train_index], X_scaled.iloc[val_index]
y_train_kf, y_val_kf = y_train.iloc[train_index], y_train.iloc[val_index]
y_simple_guess = np.median(y_train_kf)
mae_list.append(metrics.mean_absolute_error(y_val_kf, y_simple_guess*np.ones(len(y_val_kf))))
print("MAE: {:3.2f}".format(np.mean(mae_list)))
MAE: 3.45
We will evaluate the generalized performance on the following ML algorithms, with default choices of the hyperparameters for now.
#choosing default parameters. explicitly specified n_estimators and gamma for RF and SVM to ward off warning messages.
model_list = {'LR':LinearRegression(), 'LASSO':Lasso(), 'RIDGE':Ridge(), 'KNN':KNeighborsRegressor(), \
'GBR':GradientBoostingRegressor(), 'RF':RandomForestRegressor(n_estimators=100), 'SVM':SVR(gamma='scale')}
rn = np.random.randint(0, 500)
for m in model_list.keys():
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=rn)
mae_list = []
adjusted_r2_list = []
for train_index, val_index in kf.split(X_scaled):
X_train_kf, X_val_kf = X_scaled.iloc[train_index], X_scaled.iloc[val_index]
y_train_kf, y_val_kf = y_train.iloc[train_index], y_train.iloc[val_index]
clf = model_list[m]
clf.fit(X_train_kf, y_train_kf)
y_pred = clf.predict(X_val_kf)
mae_list.append(metrics.mean_absolute_error(y_val_kf, y_pred))
adjusted_r2_list.append(adjusted_r2(y_val_kf, y_pred, np.shape(X_train_kf)[0], np.shape(X_train_kf)[1]))
print("{:4}".format(m)+" ----- ", "MAE: {:3.2f}".format(np.mean(mae_list))+'+/-'+"{:3.2f}".format(np.std(mae_list)), \
" -- R2_adj: {:3.2f}".format(np.mean(adjusted_r2_list)))
LR ----- MAE: 2.37+/-0.10 -- R2_adj: 0.53 LASSO ----- MAE: 3.45+/-0.21 -- R2_adj: -0.02 RIDGE ----- MAE: 2.36+/-0.11 -- R2_adj: 0.54 KNN ----- MAE: 2.21+/-0.07 -- R2_adj: 0.59 GBR ----- MAE: 2.03+/-0.11 -- R2_adj: 0.66 RF ----- MAE: 2.04+/-0.10 -- R2_adj: 0.65 SVM ----- MAE: 2.14+/-0.11 -- R2_adj: 0.61
RF and GBR both seem promising. RF is nicer to interpret, and it gives feature importance, so let's go with it.
Rather than searching the hyperparameter space via grid search (brute force) or randomized search (a slight improvement over grid search), we will turn to a Bayesian optimization technique (the Tree Parzen Estimator) via hyperopt.
First define an objective function that returns a loss to be minimized. We will use a cross validation score so that we don't have to split the data further into training & validation sets.
def objective(hparams):
rf = RandomForestRegressor(**hparams)
cv_score = model_selection.cross_val_score(rf, X_scaled, y_train, cv=5, scoring='neg_mean_absolute_error')
loss = -np.mean(cv_score)
#progress bar
pbar.update()
return {'loss': loss, 'hparams': hparams, 'status': STATUS_OK}
Define the hyperparameter space over which we will conduct the search. We will use uniform distributions for simplicity, except for min_impurity_decrease, which gets a log-uniform prior since it spans several orders of magnitude.
domain = {
"n_estimators": scope.int(hp.quniform("n_estimators",50,150,2)),\
"max_depth": scope.int(hp.quniform('max_depth', 1,10,1)),\
"min_samples_split": scope.int(hp.quniform('min_samples_split', 2,20,1)),\
"min_samples_leaf": scope.int(hp.quniform('min_samples_leaf', 1,10,1)),\
"max_features": hp.uniform('max_features',0.01,1.0),\
"min_impurity_decrease": hp.loguniform("min_impurity_decrease", np.log(1e-4),np.log(0.5)),\
}
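To see what a single candidate configuration drawn from this space looks like, we can use the `sample` helper imported earlier (purely illustrative; values will differ from run to run).
# Draw one random configuration from the search space.
print(sample(domain))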
Run the optimization and store the trial information.
num_iter = 1000
pbar = tqdm_notebook(total=num_iter, desc='Hyperopt')
bayes_trials = Trials()
best = fmin(fn=objective, space=domain, algo=tpe.suggest, \
max_evals=num_iter, trials=bayes_trials)
pbar.close()
100%|██████████| 1000/1000 [2:18:32<00:00, 8.79s/it, best loss: 2.038474812355979]
Hyperopt returns the parameters as floats, so convert the relevant ones back to integer data types.
#just a backup copy
best_bu = best.copy()
hyperparam_list = list(best.keys())
hyperparam_list.pop(hyperparam_list.index("max_features"))
hyperparam_list.pop(hyperparam_list.index("min_impurity_decrease"))
for key in hyperparam_list:
best[key] = int(best[key])
print(best)
{'max_depth': 10, 'max_features': 0.8832777933546546, 'min_impurity_decrease': 0.0006693160693484598, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 142}
plt.figure(1, figsize=(12,5))
for i_k, k in enumerate(bayes_trials.vals.keys()):
ax = plt.subplot(2, 4, i_k+1)
ax.scatter(range(len(bayes_trials.vals[k])), bayes_trials.vals[k], label=k, s=2)
ax.set_ylabel(' '.join(k.split('_')))
ax.set_xlabel('iteration')
ax.axhline(best[k], color='C1', linewidth=2.5)
plt.tight_layout()
ax = plt.subplot(2, 4, 8)
ax.scatter(range(len(bayes_trials.losses())), bayes_trials.losses(), s=2)
ax.set_ylim(1.9, 3)
ax.set_xlabel('iteration')
ax.set_ylabel('loss')
Text(0, 0.5, 'loss')
Before we move on to deploying the best model, let's construct a learning curve to examine the bias-variance trade-off.
train_size, train_scores, test_scores = model_selection.learning_curve(RandomForestRegressor(**best), \
X_scaled, y_train, train_sizes=np.linspace(0.1, 1.0, 10),\
cv=5, scoring='neg_mean_absolute_error')
train_scores = np.abs(train_scores)
test_scores = np.abs(test_scores)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
As expected, the test (CV) error is larger than the training error (the CV error was computed on previously unseen data). There appears to be some overfitting (a difference in MAE of ~1, i.e., about 1 percentage point in the obesity rate), meaning the current model has high variance. Getting more data is not really an option, since we are limited to the # of counties in the U.S. Let's move on to examining the validation curves below to see how individual parameters of our model influence this bias-variance trade-off.
plt.plot(train_size, train_scores_mean, linewidth=2.5, label='Training Error')
plt.fill_between(train_size, train_scores_mean-train_scores_std, train_scores_mean+train_scores_std, alpha=0.5)
plt.plot(train_size, test_scores_mean, linewidth=2.5, label='CV Error')
plt.fill_between(train_size, test_scores_mean-test_scores_std, test_scores_mean+test_scores_std, alpha=0.5)
plt.xlabel('# Training Data')
plt.ylabel('MAE')
leg = plt.legend(loc=1, frameon=False)
plt.ylim(0, 3)
(0, 3)
We will also plot the validation curve for a few hyperparameters.
def plot_validation_curve(p, p_arr, p_label):
train_scores, test_scores = model_selection.validation_curve(RandomForestRegressor(**best), \
X_scaled, y_train, param_name=p, param_range=p_arr,
cv=5, scoring='neg_mean_absolute_error')
train_scores = np.abs(train_scores)
test_scores = np.abs(test_scores)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.plot(p_arr, train_scores_mean, linewidth=2.5, label='Training Error')
plt.fill_between(p_arr, train_scores_mean-train_scores_std, train_scores_mean+train_scores_std, alpha=0.5)
plt.plot(p_arr, test_scores_mean, linewidth=2.5, label='CV Error')
plt.fill_between(p_arr, test_scores_mean-test_scores_std, test_scores_mean+test_scores_std, alpha=0.5)
plt.xlabel(p_label)
plt.ylabel('MAE')
plt.ylim(0, 3)
leg = plt.legend(loc=1, frameon=False)
First, n_estimators is the number of decision trees in the forest. The two errors quickly diverge beyond ~50 trees, and the CV error asymptotes while the training error continues to decrease: as we increase the number of estimators, the training set becomes increasingly overfit.
plot_validation_curve('n_estimators', np.arange(10, 100, 20), 'Number of Estimators')
The deeper the individual tree goes, the more it will overfit.
plot_validation_curve('max_depth', np.arange(1,11), 'Maximum Tree Depth')
Next, max_features sets the fraction of features considered at each split in the decision tree.
plot_validation_curve('max_features', np.linspace(0.01, 1, 10), 'Maximum Features')
Note that all CV errors plateau to ~2% regardless of the choice of parameters, # samples, etc.
Train on the full train data set then check the MAE and the adjusted R2 scores on the test set.
rf_tuned = RandomForestRegressor(**best)
rf_tuned.fit(X_scaled, y_train)
y_pred = rf_tuned.predict(X_test)
print("{:4}".format('RF')+" ----- ", "MAE: {:3.2f}".format(metrics.mean_absolute_error(y_test, y_pred)), \
" -- R2_adj: {:3.2f}".format(adjusted_r2(y_test, y_pred, np.shape(X_test)[0], np.shape(X_test)[1])))
RF ----- MAE: 2.01 -- R2_adj: 0.68
In terms of mean absolute error, the naive baseline (median guess, MAE of 3.45) has an error nearly 70% larger than the tuned model's 2.01; put differently, the model cuts the MAE by roughly 40%!
Let's examine which features played a crucial role in predicting the obesity rates. The feature importance of a single predictor variable combines the fraction of the samples split according to that variable—a measure of its "sphere of influence," if you will—and the decrease in impurity (variance/MSE for a regression tree) resulting from that split. Feature importances are computed for each decision tree, then averaged over the ensemble of trees to produce the final list we examine here.
The most important predictors by far are related to socioeconomic status.
feat_imp = rf_tuned.feature_importances_
feat_imp_ind = np.argsort(feat_imp)[-20:]
s = sns.barplot(x=feat_imp[feat_imp_ind], y=X_scaled.columns[feat_imp_ind])
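As a quick sanity check on the "averaged over the ensemble of trees" statement above, the forest-level importances should match the mean of the per-tree impurity-based importances (a sketch; the agreement may only be approximate depending on the scikit-learn version's normalization).
# Compare the mean of per-tree importances with the forest's reported importances.
per_tree_imp = np.mean([tree.feature_importances_ for tree in rf_tuned.estimators_], axis=0)
print(np.allclose(per_tree_imp, rf_tuned.feature_importances_))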