%matplotlib inline
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import random
import csv
import scipy as sp
from statsmodels.graphics import gofplots
from statsmodels.tools.tools import add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn import preprocessing, compose
from sklearn import model_selection
from sklearn import metrics
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from hyperopt import STATUS_OK, hp, tpe, fmin, Trials
from hyperopt.pyll.stochastic import sample
from hyperopt.pyll.base import scope
from tqdm import tqdm_notebook
sns.set_context("notebook")
The Food Environment Atlas (FEA) is a publicly available dataset compiled by the US Department of Agriculture. According to the website,
"Food environment factors—such as store/restaurant proximity, food prices, food and nutrition assistance programs, and community characteristics—interact to influence food choices and diet quality. These interactions are complex and more research is needed to identify causal relationships and effective policy interventions.
The objectives of the Atlas are:
to assemble statistics on food environment indicators to stimulate research on the determinants of food choices and diet quality, and
to provide a spatial overview of a community's ability to access healthy food and its success in doing so."
This is a compilation of over 275 food environment indicators for every U.S. county. The data were collected from a variety of sources between the years 2007 and 2016.
The objective is to predict the adult obesity rate for 2013 (see the documentation for more details) based on the available data.
def adjusted_r2(y_true, y_pred, n, k):
"""
Computes the adjusted R^2 score for regression.
Arguments
---------
y_true: target variable value
y_pred: predicted target variable value from model
n: number of observations (rows)
k: number of features (columns)
Keywords
--------
None
Returns
-------
r2_adj: adjusted R^2 score
"""
from sklearn import metrics
r2_adj = 1.0 - (1.0 - metrics.r2_score(y_true, y_pred))*(n-1)/(n-k-1)
return r2_adj
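A quick sanity check of this helper on toy values (illustrative only; the arrays below are made up, with k=1 pretend feature):
# With n=5 observations and k=1 feature, the adjusted score sits slightly below the plain R^2.
y_demo_true = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y_demo_pred = np.array([1.1, 1.9, 3.2, 3.8, 5.1])
print(metrics.r2_score(y_demo_true, y_demo_pred), adjusted_r2(y_demo_true, y_demo_pred, n=5, k=1))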
def convert2log(label, X_orig, drop_orig=True, replacezero=0.0):
"""
Replaces a column in a pandas dataframe with the log10 of its values.
Arguments
---------
label: feature label of the column to be log10'd
X_orig: pandas dataframe
Keywords
--------
drop_orig: if True, discards the original column after creating its log10 version
replacezero: value substituted for zeros before taking log10
Returns
-------
X: copy of the dataframe with the log10 column added
"""
try:
X = X_orig.copy()
X[label] = X[label].apply(lambda x: replacezero if x == 0 else x)
X['LOG10_'+label] = X[label].apply(np.log10)
if drop_orig:
X = X.drop(axis=1, labels=[label])
except (ValueError, AttributeError, TypeError) as e:
print("Cannot take log10 of this data type.")
return X
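A small illustration of this helper on a made-up one-column frame (the column name and values are hypothetical):
# Zeros are replaced with `replacezero` before taking log10, and the original column is dropped.
demo_df = pd.DataFrame({'POP': [100.0, 1000.0, 0.0]})
print(convert2log('POP', demo_df, replacezero=1.0))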
def variable_lookup(label):
"""
Looks up and prints the variable definition in the lookup table.
Arguments
---------
label: feature label
Keywords
--------
None
Returns
-------
None
"""
lookup_table = pd.read_csv("/Users/jieunchoi/DataScience/food_environment_atlas/csv_files/meta/Variable List-Table 1.csv")
try:
print(label+': ',lookup_table[lookup_table['Variable Code'] == label]['Variable Name'].values[0])
except IndexError:
print(label+" is not in the lookup table.")
def convert_state_names(n, abbr=False):
"""
Converts between the state abbreviation and the full name.
List taken from https://gist.github.com/rogerallen/1583593.
Arguments
---------
n: state name in either the abbreviated or fully-spelled out format
Keywords
--------
abbr: input is in the abbreviated format
Returns
-------
result: state name in either the abbreviated or fully-spelled out format
"""
us_state_abbrev = {
'Alabama': 'AL',
'Alaska': 'AK',
'Arizona': 'AZ',
'Arkansas': 'AR',
'California': 'CA',
'Colorado': 'CO',
'Connecticut': 'CT',
'Delaware': 'DE',
'District of Columbia': 'DC',
'Florida': 'FL',
'Georgia': 'GA',
'Hawaii': 'HI',
'Idaho': 'ID',
'Illinois': 'IL',
'Indiana': 'IN',
'Iowa': 'IA',
'Kansas': 'KS',
'Kentucky': 'KY',
'Louisiana': 'LA',
'Maine': 'ME',
'Maryland': 'MD',
'Massachusetts': 'MA',
'Michigan': 'MI',
'Minnesota': 'MN',
'Mississippi': 'MS',
'Missouri': 'MO',
'Montana': 'MT',
'Nebraska': 'NE',
'Nevada': 'NV',
'New Hampshire': 'NH',
'New Jersey': 'NJ',
'New Mexico': 'NM',
'New York': 'NY',
'North Carolina': 'NC',
'North Dakota': 'ND',
'Ohio': 'OH',
'Oklahoma': 'OK',
'Oregon': 'OR',
'Pennsylvania': 'PA',
'Rhode Island': 'RI',
'South Carolina': 'SC',
'South Dakota': 'SD',
'Tennessee': 'TN',
'Texas': 'TX',
'Utah': 'UT',
'Vermont': 'VT',
'Virginia': 'VA',
'Washington': 'WA',
'West Virginia': 'WV',
'Wisconsin': 'WI',
'Wyoming': 'WY',
}
try:
if abbr:
result = [x for x in us_state_abbrev.keys() if us_state_abbrev[x] == n][0]
return(result)
else:
return(us_state_abbrev[n])
except (KeyError, IndexError) as e:
print('Cannot convert the name. Check the spelling and/or the keyword option.')
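For example (illustrative only):
print(convert_state_names('Georgia'))        # prints 'GA'
print(convert_state_names('GA', abbr=True))  # prints 'Georgia'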
Merge the individual tables to make one master dataframe.
df_list = glob.glob('csv_files/*csv')
for i, df_file in enumerate(df_list):
print(i, df_file.split('files/')[-1].split('-Table')[0])
df = pd.read_csv(df_file)
df.rename(columns=lambda x: x.upper().strip(), inplace=True)
if i == 0:
master_df = df.copy()
else:
df.drop(columns=['STATE', 'COUNTY'], inplace=True)
master_df = master_df.merge(df, on=['FIPS'], how='left')
0 SOCIOECONOMIC 1 INSECURITY 2 ASSISTANCE 3 ACCESS 4 HEALTH 5 PRICES_TAXES 6 RESTAURANTS 7 STORES 8 LOCAL
master_df.head()
FIPS | STATE | COUNTY | PCT_NHWHITE10 | PCT_NHBLACK10 | PCT_HISP10 | PCT_NHASIAN10 | PCT_NHNA10 | PCT_NHPI10 | PCT_65OLDER10 | ... | CSA12 | PCH_CSA_07_12 | AGRITRSM_OPS07 | AGRITRSM_OPS12 | PCH_AGRITRSM_OPS_07_12 | AGRITRSM_RCT07 | AGRITRSM_RCT12 | PCH_AGRITRSM_RCT_07_12 | FARM_TO_SCHOOL09 | FARM_TO_SCHOOL13 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1001 | AL | Autauga | 77.246156 | 17.582599 | 2.400542 | 0.855766 | 0.397647 | 0.040314 | 11.995382 | ... | 3.0 | 50.000000 | 7.0 | 10.0 | 42.857143 | 228000.0 | 146000.0 | -35.964912 | 0.0 | NaN |
1 | 1003 | AL | Baldwin | 83.504787 | 9.308425 | 4.384824 | 0.735193 | 0.628755 | 0.043343 | 16.771185 | ... | 7.0 | -46.153846 | 18.0 | 16.0 | -11.111111 | 124000.0 | 204000.0 | 64.516129 | 0.0 | 0.0 |
2 | 1005 | AL | Barbour | 46.753105 | 46.691190 | 5.051535 | 0.389700 | 0.218524 | 0.087409 | 14.236807 | ... | 0.0 | -100.000000 | 27.0 | 32.0 | 18.518519 | 163000.0 | 304000.0 | 86.503067 | 0.0 | 1.0 |
3 | 1007 | AL | Bibb | 75.020729 | 21.924504 | 1.771765 | 0.096007 | 0.279293 | 0.030548 | 12.681650 | ... | 3.0 | 50.000000 | 5.0 | 6.0 | 20.000000 | NaN | 21000.0 | NaN | 0.0 | 0.0 |
4 | 1009 | AL | Blount | 88.887338 | 1.263040 | 8.070200 | 0.200621 | 0.497191 | 0.031402 | 14.722096 | ... | 4.0 | -42.857143 | 10.0 | 8.0 | -20.000000 | 293000.0 | 30000.0 | -89.761092 | 0.0 | 1.0 |
5 rows × 280 columns
All columns are numeric except for two, STATE and COUNTY.
master_df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 3143 entries, 0 to 3142 Columns: 280 entries, FIPS to FARM_TO_SCHOOL13 dtypes: float64(248), int64(30), object(2) memory usage: 6.7+ MB
Add population data. We need to work a little bit on the column names in order to merge this df seamlessly onto the master data frame.
df_county = pd.read_csv("csv_files/meta/Supplemental Data - County-Table 1.csv")
df_county.dropna(axis=0, how="any", inplace=True)
df_county.rename(columns=lambda x: "_".join(x.upper().\
replace(",","").replace("POPULATION","POP").replace("ESTIMATE","EST").strip().split()), inplace=True)
col_tmp = df_county.columns.drop(['FIPS', 'STATE', 'COUNTY'])
df_county[col_tmp] = df_county[col_tmp].apply(lambda x: pd.to_numeric(x.astype(str).str.replace(',',''))).astype('float64')
df_county.drop(columns=['STATE', 'COUNTY'], inplace=True)
master_df = master_df.merge(df_county, on=['FIPS'], how='left')
master_df.info()
master_df.head()
<class 'pandas.core.frame.DataFrame'> Int64Index: 3143 entries, 0 to 3142 Columns: 287 entries, FIPS to POP_EST_2016 dtypes: float64(255), int64(30), object(2) memory usage: 6.9+ MB
FIPS | STATE | COUNTY | PCT_NHWHITE10 | PCT_NHBLACK10 | PCT_HISP10 | PCT_NHASIAN10 | PCT_NHNA10 | PCT_NHPI10 | PCT_65OLDER10 | ... | PCH_AGRITRSM_RCT_07_12 | FARM_TO_SCHOOL09 | FARM_TO_SCHOOL13 | 2010_CENSUS_POP | POP_EST_2011 | POP_EST_2012 | POP_EST_2013 | POP_EST_2014 | POP_EST_2015 | POP_EST_2016 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1001 | AL | Autauga | 77.246156 | 17.582599 | 2.400542 | 0.855766 | 0.397647 | 0.040314 | 11.995382 | ... | -35.964912 | 0.0 | NaN | 54571.0 | 55255.0 | 55027.0 | 54792.0 | 54977.0 | 55035.0 | 55416.0 |
1 | 1003 | AL | Baldwin | 83.504787 | 9.308425 | 4.384824 | 0.735193 | 0.628755 | 0.043343 | 16.771185 | ... | 64.516129 | 0.0 | 0.0 | 182265.0 | 186653.0 | 190403.0 | 195147.0 | 199745.0 | 203690.0 | 208563.0 |
2 | 1005 | AL | Barbour | 46.753105 | 46.691190 | 5.051535 | 0.389700 | 0.218524 | 0.087409 | 14.236807 | ... | 86.503067 | 0.0 | 1.0 | 27457.0 | 27326.0 | 27132.0 | 26938.0 | 26763.0 | 26270.0 | 25965.0 |
3 | 1007 | AL | Bibb | 75.020729 | 21.924504 | 1.771765 | 0.096007 | 0.279293 | 0.030548 | 12.681650 | ... | NaN | 0.0 | 0.0 | 22915.0 | 22736.0 | 22645.0 | 22501.0 | 22511.0 | 22561.0 | 22643.0 |
4 | 1009 | AL | Blount | 88.887338 | 1.263040 | 8.070200 | 0.200621 | 0.497191 | 0.031402 | 14.722096 | ... | -89.761092 | 0.0 | 1.0 | 57322.0 | 57707.0 | 57772.0 | 57746.0 | 57621.0 | 57676.0 | 57704.0 |
5 rows × 287 columns
Let's add some relevant columns from the 2013 US Census data since the main dataset only contains poverty rate and median income for 2015.
poverty_income_df_unf = pd.read_csv('csv_files/census/est13all.csv')
Drop empty header rows and reset the index. Select only the columns we want to append to the master data frame.
poverty_income_df = poverty_income_df_unf.drop(labels=[0,1,2,3], axis=0).copy()
poverty_income_df.columns = poverty_income_df_unf.iloc[2]
poverty_income_df = poverty_income_df.reset_index(drop=True)
poverty_income_df = poverty_income_df[['State FIPS Code', 'County FIPS Code', \
'Postal Code', 'Poverty Percent, All Ages', \
'Median Household Income', 'Poverty Percent, Age 0-17']]
poverty_income_df['Median Household Income'] = poverty_income_df['Median Household Income'].apply(lambda x: x.replace(",",""))
Rename columns, convert to the appropriate data types, and drop rows containing overall state data.
poverty_income_cols = poverty_income_df.columns.drop(labels=['State FIPS Code', 'County FIPS Code', 'Postal Code'])
poverty_income_df[poverty_income_cols] = poverty_income_df[poverty_income_cols].apply(pd.to_numeric, errors='coerce')
poverty_income_df = poverty_income_df.drop(poverty_income_df[poverty_income_df['County FIPS Code'] == '000'].index)
poverty_income_df.columns = ['STATE_FIPS', 'COUNTY_FIPS', \
'STATE', 'POVRATE13_ALL', \
'MEDHHINC13', 'POVRATE13_CHILD']
poverty_income_df['FIPS'] = poverty_income_df.apply(lambda row: int(row.STATE_FIPS)*1000+int(row.COUNTY_FIPS), axis=1)
poverty_income_df = poverty_income_df.drop(columns=['STATE_FIPS', 'COUNTY_FIPS', 'STATE'])
Merge with the main data frame.
master_df = master_df.merge(poverty_income_df, on=['FIPS'], how='left')
master_df.info()
master_df.head()
<class 'pandas.core.frame.DataFrame'> Int64Index: 3143 entries, 0 to 3142 Columns: 290 entries, FIPS to POVRATE13_CHILD dtypes: float64(258), int64(30), object(2) memory usage: 7.0+ MB
FIPS | STATE | COUNTY | PCT_NHWHITE10 | PCT_NHBLACK10 | PCT_HISP10 | PCT_NHASIAN10 | PCT_NHNA10 | PCT_NHPI10 | PCT_65OLDER10 | ... | 2010_CENSUS_POP | POP_EST_2011 | POP_EST_2012 | POP_EST_2013 | POP_EST_2014 | POP_EST_2015 | POP_EST_2016 | POVRATE13_ALL | MEDHHINC13 | POVRATE13_CHILD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1001 | AL | Autauga | 77.246156 | 17.582599 | 2.400542 | 0.855766 | 0.397647 | 0.040314 | 11.995382 | ... | 54571.0 | 55255.0 | 55027.0 | 54792.0 | 54977.0 | 55035.0 | 55416.0 | 13.5 | 51868.0 | 19.6 |
1 | 1003 | AL | Baldwin | 83.504787 | 9.308425 | 4.384824 | 0.735193 | 0.628755 | 0.043343 | 16.771185 | ... | 182265.0 | 186653.0 | 190403.0 | 195147.0 | 199745.0 | 203690.0 | 208563.0 | 14.2 | 47539.0 | 20.8 |
2 | 1005 | AL | Barbour | 46.753105 | 46.691190 | 5.051535 | 0.389700 | 0.218524 | 0.087409 | 14.236807 | ... | 27457.0 | 27326.0 | 27132.0 | 26938.0 | 26763.0 | 26270.0 | 25965.0 | 28.2 | 30981.0 | 42.4 |
3 | 1007 | AL | Bibb | 75.020729 | 21.924504 | 1.771765 | 0.096007 | 0.279293 | 0.030548 | 12.681650 | ... | 22915.0 | 22736.0 | 22645.0 | 22501.0 | 22511.0 | 22561.0 | 22643.0 | 23.1 | 39781.0 | 35.6 |
4 | 1009 | AL | Blount | 88.887338 | 1.263040 | 8.070200 | 0.200621 | 0.497191 | 0.031402 | 14.722096 | ... | 57322.0 | 57707.0 | 57772.0 | 57746.0 | 57621.0 | 57676.0 | 57704.0 | 17.2 | 44392.0 | 26.6 |
5 rows × 290 columns
Inspect the variable list to figure out which data columns should be dropped. We want to discard columns that are redundant (e.g., % change columns, which can be derived from the two years of data already included in the dataset) or not normalized to population (e.g., raw counts). Let's also discard data from 2014 and beyond, since we'll make predictions for 2013.
variable_df = pd.read_csv('/Users/jieunchoi/DataScience/food_environment_atlas/csv_files/meta/Variable List-Table 1.csv')
print(np.unique(variable_df.Units))
['# per 1,000 pop' '% change' '1,000 dollars' 'Acres' 'Acres/1,000 pop' 'Classification' 'Count' 'Dollars' 'Dollars/capita' 'Dollars/store' 'Legend' 'Percent' 'Percentage points' 'Ratio' 'Sq ft' 'Sq ft/1,000 pop']
data13 = [x for x in variable_df['Variable Code'] if int(x[-2:]) > 13]
badunits = ['% change', 'Sq ft', 'Percentage points', '1,000 dollars', 'Count', 'Dollars']
bad_variable_df = variable_df[(variable_df['Units'].isin(badunits))|(variable_df['Variable Code'].isin(data13))]
good_variable_df = variable_df[(~variable_df['Units'].isin(badunits))&(~variable_df['Variable Code'].isin(data13))]
Use the selected column names to filter the master data frame. Also drop >2013 columns added from the supplemental table.
master_df = master_df.drop(columns=bad_variable_df['Variable Code'].tolist()+['POP_EST_2014','POP_EST_2015','POP_EST_2016'])
Change the data type of the categorical variables. Any remaining integer columns can safely be cast to float for uniformity.
cat_vars = good_variable_df['Variable Code'][(good_variable_df.Units == 'Classification')|(good_variable_df.Units == 'Legend')]
master_df[cat_vars] = master_df[cat_vars].astype("category")
master_df['FIPS'] = master_df['FIPS'].astype("object")
master_df[master_df.columns[master_df.dtypes == int]] = master_df[master_df.columns[master_df.dtypes == int]].astype(float)
master_df.head()
FIPS | STATE | COUNTY | PCT_NHWHITE10 | PCT_NHBLACK10 | PCT_HISP10 | PCT_NHASIAN10 | PCT_NHNA10 | PCT_NHPI10 | PCT_65OLDER10 | ... | GHVEG_SQFTPTH12 | FARM_TO_SCHOOL09 | FARM_TO_SCHOOL13 | 2010_CENSUS_POP | POP_EST_2011 | POP_EST_2012 | POP_EST_2013 | POVRATE13_ALL | MEDHHINC13 | POVRATE13_CHILD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1001 | AL | Autauga | 77.246156 | 17.582599 | 2.400542 | 0.855766 | 0.397647 | 0.040314 | 11.995382 | ... | 0.000000 | 0.0 | NaN | 54571.0 | 55255.0 | 55027.0 | 54792.0 | 13.5 | 51868.0 | 19.6 |
1 | 1003 | AL | Baldwin | 83.504787 | 9.308425 | 4.384824 | 0.735193 | 0.628755 | 0.043343 | 16.771185 | ... | 326.477447 | 0.0 | 0.0 | 182265.0 | 186653.0 | 190403.0 | 195147.0 | 14.2 | 47539.0 | 20.8 |
2 | 1005 | AL | Barbour | 46.753105 | 46.691190 | 5.051535 | 0.389700 | 0.218524 | 0.087409 | 14.236807 | ... | 0.000000 | 0.0 | 1.0 | 27457.0 | 27326.0 | 27132.0 | 26938.0 | 28.2 | 30981.0 | 42.4 |
3 | 1007 | AL | Bibb | 75.020729 | 21.924504 | 1.771765 | 0.096007 | 0.279293 | 0.030548 | 12.681650 | ... | NaN | 0.0 | 0.0 | 22915.0 | 22736.0 | 22645.0 | 22501.0 | 23.1 | 39781.0 | 35.6 |
4 | 1009 | AL | Blount | 88.887338 | 1.263040 | 8.070200 | 0.200621 | 0.497191 | 0.031402 | 14.722096 | ... | 0.000000 | 0.0 | 1.0 | 57322.0 | 57707.0 | 57772.0 | 57746.0 | 17.2 | 44392.0 | 26.6 |
5 rows × 98 columns
Spot check the number of counties attributed to each state. GA has 159 counties compared to CA's 58! Remarkable.
master_df = master_df.drop_duplicates(['FIPS'])
state_county_df = master_df[['STATE', 'COUNTY']].groupby(by='STATE').count()
display(state_county_df.head(n=15))
COUNTY | |
---|---|
STATE | |
AK | 29 |
AL | 67 |
AR | 75 |
AZ | 15 |
CA | 58 |
CO | 64 |
CT | 8 |
DC | 1 |
DE | 3 |
FL | 67 |
GA | 159 |
HI | 5 |
IA | 99 |
ID | 44 |
IL | 102 |
Perform additional basic checks for problematic data.
There are some feature columns (float) with a large number of zeros. For instance, some counties have no Pacific Islander population and no recreation & fitness facilities. We will drop columns in which more than 10% of the values are 0.
for c in master_df.columns[master_df.dtypes == float]:
num_zeros = len(master_df[c][master_df[c] == 0])
num_rows = master_df.shape[0]
if num_zeros > num_rows*0.1:
print(c, num_zeros)
master_df.drop(columns=c, inplace=True)
PCT_NHPI10 485 RECFACPTH09 893 SUPERCPTH09 1489 SPECSPTH09 1140 FMRKTPTH09 1362 ORCHARD_ACRES07 360 ORCHARD_ACRES12 351 ORCHARD_ACRESPTH07 360 ORCHARD_ACRESPTH12 350 BERRY_ACRES07 844 BERRY_ACRES12 742 BERRY_ACRESPTH07 844 BERRY_ACRESPTH12 741 GHVEG_SQFTPTH07 1879 GHVEG_SQFTPTH12 1412
Next, we will examine missing data. Most of the columns that are missing >100 rows seem hyperspecific and understandably irrelevant to many counties, for example FRESHVEG_ACRES07, the number of acres of vegetables harvested for the fresh market in 2007. For simplicity, let's drop the columns with >300 rows of missing data. This will remove mainly agriculture/farm-related columns.
count_null_values = master_df.isnull().sum().sort_values(ascending=False)
print('There are',np.shape(master_df)[0],'rows total. The following columns are missing >100 rows.')
missing_gt100 = count_null_values[count_null_values > 100].copy()
print(missing_gt100)
missing_gt300 = count_null_values[(count_null_values > 300)].copy()
print('Dropping', len(missing_gt300),'columns with >300 missing rows.')
master_df = master_df.drop(labels=missing_gt300.index, axis=1)
There are 3143 rows total. The following columns are missing >100 rows. FRESHVEG_ACRESPTH07 1300 FRESHVEG_ACRES07 1300 FRESHVEG_ACRESPTH12 1227 FRESHVEG_ACRES12 1226 REDEMP_WICS12 1139 PC_WIC_REDEMP12 1139 REDEMP_WICS08 1064 PC_WIC_REDEMP08 1064 VEG_ACRESPTH12 612 VEG_ACRESPTH07 611 VEG_ACRES07 611 VEG_ACRES12 562 PCT_LOCLSALE07 346 PC_DIRSALES07 288 PCT_LOCLSALE12 287 PC_DIRSALES12 243 REDEMP_SNAPS12 242 FARM_TO_SCHOOL13 208 dtype: int64 Dropping 13 columns with >300 missing rows.
Let's deal with the missing data via imputation. We will fill in missing rows using the median value for each state. For columns that are missing data for entire states, we will fall back to the national median value.
cols_w_null_values = master_df.columns[master_df.isnull().any()]
for c in cols_w_null_values:
try:
master_df[c] = master_df.groupby('STATE')[c].apply(lambda x: x.fillna(x.median()))
except TypeError:
#for categorical variables, fill with the most frequent value, which is returned first in value_counts
master_df[c] = master_df.groupby('STATE')[c].apply(lambda x: x.fillna(x.value_counts().index[0]))
/Users/jieunchoi/anaconda/envs/myenv/lib/python3.6/site-packages/numpy/lib/nanfunctions.py:1076: RuntimeWarning: Mean of empty slice return np.nanmean(a, axis, out=out, keepdims=keepdims)
cols_w_null_values = master_df.columns[master_df.isnull().any()]
for c in cols_w_null_values:
try:
master_df[c] = master_df[c].fillna(master_df[c].median())
except TypeError:
master_df[c] = master_df[c].fillna(master_df[c].value_counts().index[0])
print('Any remaining missing values?: ', master_df.isnull().values.any())
print('There are a total of', np.shape(master_df)[0], 'counties (rows) and', np.shape(master_df)[1], 'features (columns).')
Any remaining missing values?: False There are a total of 3143 counties (rows) and 70 features (columns).
master_df.head()
FIPS | STATE | COUNTY | PCT_NHWHITE10 | PCT_NHBLACK10 | PCT_HISP10 | PCT_NHASIAN10 | PCT_NHNA10 | PCT_65OLDER10 | PCT_18YOUNGER10 | ... | PC_DIRSALES12 | FARM_TO_SCHOOL09 | FARM_TO_SCHOOL13 | 2010_CENSUS_POP | POP_EST_2011 | POP_EST_2012 | POP_EST_2013 | POVRATE13_ALL | MEDHHINC13 | POVRATE13_CHILD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1001 | AL | Autauga | 77.246156 | 17.582599 | 2.400542 | 0.855766 | 0.397647 | 11.995382 | 26.777959 | ... | 5.582238 | 0.0 | 0.0 | 54571.0 | 55255.0 | 55027.0 | 54792.0 | 13.5 | 51868.0 | 19.6 |
1 | 1003 | AL | Baldwin | 83.504787 | 9.308425 | 4.384824 | 0.735193 | 0.628755 | 16.771185 | 22.987408 | ... | 3.403433 | 0.0 | 0.0 | 182265.0 | 186653.0 | 190403.0 | 195147.0 | 14.2 | 47539.0 | 20.8 |
2 | 1005 | AL | Barbour | 46.753105 | 46.691190 | 5.051535 | 0.389700 | 0.218524 | 14.236807 | 21.906982 | ... | 0.478663 | 0.0 | 1.0 | 27457.0 | 27326.0 | 27132.0 | 26938.0 | 28.2 | 30981.0 | 42.4 |
3 | 1007 | AL | Bibb | 75.020729 | 21.924504 | 1.771765 | 0.096007 | 0.279293 | 12.681650 | 22.696923 | ... | 0.883314 | 0.0 | 0.0 | 22915.0 | 22736.0 | 22645.0 | 22501.0 | 23.1 | 39781.0 | 35.6 |
4 | 1009 | AL | Blount | 88.887338 | 1.263040 | 8.070200 | 0.200621 | 0.497191 | 14.722096 | 24.608353 | ... | 8.567571 | 0.0 | 1.0 | 57322.0 | 57707.0 | 57772.0 | 57746.0 | 17.2 | 44392.0 | 26.6 |
5 rows × 70 columns
Let's begin by exploring the distributions of some parameters. First, our target variable: the adult obesity rate is the percent of the population aged > 20 with BMI > 30.
s = sns.distplot(master_df['PCT_OBESE_ADULTS13'].dropna())
_ = s.set_xlabel('2013 Obesity Rate [%]')
_ = s.set_ylabel('pdf')
Just as an aside, construct a QQ plot and run the Shapiro-Wilk test for normality (H0 = normal distribution). These two checks rule out normality for our obesity rate distribution.
For linear regression, normality of the dependent variable is not a required assumption. Rather, the assumption is that the residual errors—any variation not explained by the features—are normally distributed. Some references note that even residual normality does not need to be satisfied as the sample size increases (CLT).
f, ax = plt.subplots(1,1,figsize=(5,5))
fig = gofplots.qqplot(master_df['PCT_OBESE_ADULTS13'], line='s', ax=ax, alpha=0.2, c='#1f77b4')
st, p = sp.stats.shapiro(master_df['PCT_OBESE_ADULTS13'])
print('Shapiro-Wilk normality test: W={:6.5f}, p={:6.3e}'.format(st, p))
Shapiro-Wilk normality test: W=0.99042, p=1.075e-13
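As a side note, the residual-normality assumption can be checked directly. Below is a minimal sketch (not part of the main pipeline) that fits a quick OLS of the target on two numeric columns already in master_df and runs the same Shapiro-Wilk test on the residuals; the choice of POVRATE13_ALL and MEDHHINC13 as predictors is arbitrary and purely illustrative.
# Fit a simple two-feature OLS and test the residuals, rather than the target itself, for normality.
resid_df = master_df[['PCT_OBESE_ADULTS13', 'POVRATE13_ALL', 'MEDHHINC13']].dropna()
lr_demo = LinearRegression().fit(resid_df[['POVRATE13_ALL', 'MEDHHINC13']], resid_df['PCT_OBESE_ADULTS13'])
residuals = resid_df['PCT_OBESE_ADULTS13'] - lr_demo.predict(resid_df[['POVRATE13_ALL', 'MEDHHINC13']])
st_r, p_r = sp.stats.shapiro(residuals)
print('Shapiro-Wilk on OLS residuals: W={:6.5f}, p={:6.3e}'.format(st_r, p_r))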
Let's look at some other parameter distributions. There's a remarkable spread in county population size (also, look how log-symmetric-ish it is!). The adult diabetes rate is the percentage of the population aged > 20 with diabetes (types 1 and 2, excluding gestational diabetes). The poverty rate is the percentage of the population with income below the poverty threshold, which varies by family size, age of the head of the household, etc.; for 2017, it was approximately $12,500 for an individual and $25,100 for a family of 4. Low access to store is defined as the percentage of the population living > 1 mile (> 10 miles) from the nearest supermarket or grocery store in an urban (rural) area. The last distribution is strongly skewed toward higher percentages of the population with low access to a store.
f, axes = plt.subplots(1,4,figsize=(15,4))
s = sns.distplot(np.log10(master_df['2010_CENSUS_POP'].dropna()), ax=axes[0])
s.set_xlabel('log(2010 Census Population)')
s = sns.distplot(master_df['PCT_DIABETES_ADULTS13'].dropna(), ax=axes[1])
s.set_xlabel('2013 Adult Diabetes Rate [%]')
s = sns.distplot(master_df['POVRATE13_ALL'].dropna(), ax=axes[2])
s.set_xlabel('2013 All Ages Poverty Rate [%]')
s = sns.distplot(master_df['PCT_LACCESS_POP10'].dropna(), ax=axes[3])
s.set_xlabel('2010 Low Access to Store [%]')
f.tight_layout()
min_pop_ind = master_df['2010_CENSUS_POP'].dropna().idxmin()
max_pop_ind = master_df['2010_CENSUS_POP'].dropna().idxmax()
print('According to the 2010 Census, county population ranges from ',
master_df['COUNTY'][min_pop_ind], master_df['STATE'][min_pop_ind],':',
int(min(master_df['2010_CENSUS_POP'].dropna())),
'to',
master_df['COUNTY'][max_pop_ind], master_df['STATE'][max_pop_ind],':',
int(max(master_df['2010_CENSUS_POP'].dropna())))
According to the 2010 Census, county population ranges from Loving TX : 82 to Los Angeles CA : 9818605
Let's briefly examine the counties where 100% of the population experiences low access to store, to understand the extreme values. After spot-checking a few counties along with a cross-referenced Google Maps search, it seems that what appear to be regular grocery stores do exist near residential areas in these counties. Some guesses for why ~100% of the population in these counties is labeled as having low access to grocery stores: (1) these grocery stores do not meet the somewhat strict definition of "supermarket/large grocery store," e.g., more than $2 million in annual sales, or (2) population data were not properly allocated to their 0.5 sq. km grids. The 100% designation does appear to be correct for some of these counties (e.g., for Kiowa, CO, the nearest large grocery store, a Safeway, is ~8 miles from the town center), while for others it is more ambiguous. We will therefore proceed without attempting to correct or remove these potential outliers, especially given their small numbers.
master_df[['STATE', 'COUNTY', 'METRO13', 'PCT_LACCESS_POP10']][master_df['PCT_LACCESS_POP10']>99.99].head()
STATE | COUNTY | METRO13 | PCT_LACCESS_POP10 | |
---|---|---|---|---|
72 | AK | Denali | 0 | 100.000001 |
81 | AK | Lake and Peninsula | 0 | 100.000000 |
95 | AK | Yukon-Koyukuk | 0 | 100.000000 |
271 | CO | Hinsdale | 0 | 99.999777 |
275 | CO | Kiowa | 0 | 100.000000 |
We can also examine the prevalence of adult obesity broken down separately for metro/nonmetro counties and counties with persistent poverty (poverty rate > 20% in the 1980, 1990, and 2000 censuses and the American Community Survey 5-year estimates for 2007-2011).
g = sns.FacetGrid(master_df, row='METRO13', col='PERPOV10', height=4)
g.map(sns.distplot, 'PCT_OBESE_ADULTS13').set_axis_labels("2013 Obesity Rate [%]", "pdf")
axes = g.axes.flatten()
axes[0].set_title("Non-Metro/No Persistent Poverty")
axes[1].set_title("Non-Metro/Persistent Poverty")
axes[2].set_title("Metro/No Persistent Poverty")
axes[3].set_title("Metro/Persistent Poverty")
Text(0.5, 1.0, 'Metro/Persistent Poverty')
Let's now move onto more explicitly examining relationships between obesity and the available parameters.
The positive trend between poverty and obesity rate is not too surprising if we believe that poverty is associated with poor food choices, which are in turn associated with poor health/obesity. There is a large spread in obesity rates as a function of average milk price, though there may be a weak non-linear relationship. There isn't an obvious, strong correlation between the prevalence of fast-food restaurants and obesity rates. Based on studies such as this, we know that obesity disproportionately affects African-Americans and Mexican-Americans. This is borne out by the data (at least for the African-American population, as separate Mexican-American data is not available).
f, axes = plt.subplots(1,4,figsize=(15,4))
s = sns.scatterplot(y='PCT_OBESE_ADULTS13', x='POVRATE13_ALL', data=master_df, ax=axes[0], s=5)
s.set_xlabel('2013 All Ages Poverty Rate [%]')
s.set_ylabel('2013 Adult Obesity [%]')
s.set_xlim(0,50)
s = sns.scatterplot(y='PCT_OBESE_ADULTS13', x='MILK_PRICE10', data=master_df, ax=axes[1], s=5)
s.set_xlabel('2010 Milk Price/National Average')
s.set_ylabel('2013 Adult Obesity [%]')
s.set_xlim(0.7,1.5)
s = sns.scatterplot(y='PCT_OBESE_ADULTS13', x='FFRPTH09', data=master_df, ax=axes[2], s=5)
s.set_xlabel('2009 Fast-Food Restaurants/1000')
s.set_xlim(0,1.5)
s.set_ylabel('2013 Adult Obesity [%]')
s = sns.scatterplot(y='PCT_OBESE_ADULTS13', x='PCT_NHBLACK10', data=master_df, ax=axes[3], s=5, legend='full')
s.set_xlabel('2010 Black & African/American [%]')
s.set_ylabel('2013 Adult Obesity [%]')
s.set_xlim(0,90)
f.tight_layout()
We can examine the same distributions broken up by race (above/below the national average according to the 2010 U.S. Census). It's interesting that the obesity and diabetes rate distributions differ visibly between counties where the black/African-American population fraction exceeds the national average (12.6%) and counties where it falls below. The same does not hold for counties where the white population fraction is above vs. below the national average (72.4%).
small_df = master_df[['PCT_OBESE_ADULTS13', 'POVRATE13_ALL', 'MILK_PRICE10', 'FFRPTH09', 'PCT_DIABETES_ADULTS13', 'PCT_NHBLACK10']].copy()
small_df['BLACK10_ABVNATAVG'] = small_df.apply(lambda x: int(x['PCT_NHBLACK10'] > 12.6), axis=1)
sns.pairplot(small_df, vars=['PCT_OBESE_ADULTS13', 'POVRATE13_ALL', 'MILK_PRICE10', 'FFRPTH09', 'PCT_DIABETES_ADULTS13'], hue='BLACK10_ABVNATAVG')
<seaborn.axisgrid.PairGrid at 0x117e5ec50>
small_df = master_df[['PCT_OBESE_ADULTS13', 'POVRATE13_ALL', 'MILK_PRICE10', 'FFRPTH09', 'PCT_DIABETES_ADULTS13', 'PCT_NHWHITE10']].copy()
small_df['WHITE10_ABVNATAVG'] = small_df.apply(lambda x: int(x['PCT_NHWHITE10'] > 72.4), axis=1)
sns.pairplot(small_df, vars=['PCT_OBESE_ADULTS13', 'POVRATE13_ALL', 'MILK_PRICE10', 'FFRPTH09', 'PCT_DIABETES_ADULTS13'], hue='WHITE10_ABVNATAVG')
<seaborn.axisgrid.PairGrid at 0x117e75898>
Next, although it may seem trivial, let's inspect poverty rate vs. median household income. As expected, there's a negative trend, but the exact form of this scatterplot is pretty interesting. The steep rise below a poverty rate of 10% implies that even in low-poverty areas, there's a large variability in the overall affluence of the population (ranging from ~$40k to over $100k in median household income). Similarly, the long tail in the poverty rate beyond 10% implies that there are factors beyond median income at play (cost of living is one example). At a fixed poverty rate, counties with a higher percentage of white population seem to have higher median household incomes.
s = sns.scatterplot(x='POVRATE13_ALL', y='MEDHHINC13', hue='PCT_NHWHITE10', palette='Blues', hue_norm=(0,90), data=master_df, s=8)
legend = s.legend()
legend.texts[0].set_text("2010 Non-Hispanic White [%]")
_ = s.set_xlabel('2013 All Ages Poverty Rate [%]')
_ = s.set_ylabel('2013 Median Household Income [$]')
Just for fun, let's select a random subset of the counties and inspect the breakdown in race.
total_rows = master_df.shape[0]
rand_ind = random.sample(range(total_rows), 6)
plt.figure(figsize=(15,2.5))
race_df = master_df.iloc[rand_ind,:][['FIPS', 'STATE', 'COUNTY', 'PCT_NHWHITE10', 'PCT_NHBLACK10', 'PCT_HISP10', 'PCT_NHASIAN10', 'PCT_NHNA10']].copy()
race_df['COUNTY_STATE'] = race_df['COUNTY']+', '+race_df['STATE']
race_df = race_df.drop(columns=['FIPS', 'STATE', 'COUNTY'])
race_column_list = race_df.columns.drop('COUNTY_STATE')
race_df = race_df.rename(columns=lambda x: x.split('PCT_')[1].split('10')[0] if x in race_column_list else x)
race_column_list = race_df.columns.drop('COUNTY_STATE').drop('HISP')
race_df = race_df.rename(columns=lambda x: x.split('NH')[1] if x in race_column_list else x)
race_df = pd.melt(race_df, id_vars=['COUNTY_STATE'])
s = sns.barplot(x='COUNTY_STATE', y='value', hue='variable', data=race_df)
leg = plt.legend(bbox_to_anchor=(1.01, 1), loc=2, frameon=False)
s.set_xlabel('County, State')
s.set_ylabel('2010 Population [%]')
Text(0, 0.5, '2010 Population [%]')
Get a list of features that have outliers as determined by the IQR score for reference later. Outliers are understandable and even expected for certain features like ethnicity and population.
for c in master_df.columns[master_df.dtypes == float]:
q1 = master_df[c].quantile(0.25)
q3 = master_df[c].quantile(0.75)
iqr = q3-q1
n_potential_outliers = sum((master_df[c] < (q1-1.5*iqr))|(master_df[c] > (q3+1.5*iqr)))
if n_potential_outliers > 50:
print('{:25}'.format(c), n_potential_outliers)
PCT_NHWHITE10 69 PCT_NHBLACK10 420 PCT_HISP10 378 PCT_NHASIAN10 391 PCT_NHNA10 442 PCT_65OLDER10 70 PCT_18YOUNGER10 125 PC_SNAPBEN10 65 PCT_FREE_LUNCH09 58 PCT_REDUCED_LUNCH09 104 PCT_SFSP09 341 PCT_WIC09 159 PCT_CACFP09 146 PCT_LACCESS_POP10 175 PCT_LACCESS_LOWI10 223 PCT_LACCESS_HHNV10 175 PCT_LACCESS_CHILD10 176 PCT_LACCESS_SENIORS10 240 PCT_OBESE_ADULTS08 191 PCT_OBESE_ADULTS13 79 FFRPTH09 71 PC_FSRSALES12 107 GROCPTH09 242 CONVSPTH09 104 SNAPSPTH12 83 WICSPTH08 234 WICSPTH12 233 PCT_LOCLFARM07 237 PCT_LOCLFARM12 224 PCT_LOCLSALE12 423 PC_DIRSALES07 236 PC_DIRSALES12 261 2010_CENSUS_POP 415 POP_EST_2011 418 POP_EST_2012 417 POP_EST_2013 416 POVRATE13_ALL 67 MEDHHINC13 116
We have to do a few more things before we can throw the data at ML algorithms.
We will rename the data to X following the general convention, then split into training and test sets. We'll remove the 'PCT_DIABETES_ADULTS' columns because Type II diabetes is strongly associated with obesity ("People who are obese -- more than 20% over their ideal body weight for their height -- are at particularly high risk of developing type 2 diabetes and its related medical problems.") and we want to predict obesity/diabetes from food environment factors, not from each other.
X = master_df.drop(axis=1, labels=['FIPS', 'STATE', 'COUNTY', 'PCT_OBESE_ADULTS08', 'PCT_OBESE_ADULTS13', \
'PCT_DIABETES_ADULTS08', 'PCT_DIABETES_ADULTS13'])
y = master_df['PCT_OBESE_ADULTS13']
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3)
The features that will be used to predict the obesity rate are listed below. Many of these features appear to be various representations/encodings of poverty/socioeconomic status.
for t in X_train.columns:
try:
variable_lookup(t)
except:
print(t)
PCT_NHWHITE10: % White, 2010 PCT_NHBLACK10: % Black, 2010 PCT_HISP10: % Hispanic, 2010 PCT_NHASIAN10: % Asian, 2010 PCT_NHNA10: % American Indian or Alaska Native, 2010 PCT_65OLDER10: % Population 65 years or older, 2010 PCT_18YOUNGER10: % Population under age 18, 2010 PERPOV10: Persistent-poverty counties, 2010 PERCHLDPOV10: Persistent-child-poverty counties, 2010 METRO13: Metro/nonmetro counties, 2010 POPLOSS10: Population-loss counties, 2010 FOODINSEC_10_12: Household food insecurity (%, three-year average), 2010-12* VLFOODSEC_10_12: Household very low food security (%, three-year average), 2010-12* FOODINSEC_CHILD_01_07: Child food insecurity (% households, multiple-year average), 2001-07* FOODINSEC_CHILD_03_11: Child food insecurity (% households, multiple-year average), 2003-11* REDEMP_SNAPS12: SNAP redemptions/SNAP-authorized stores, 2012 PCT_SNAP12: SNAP participants (% pop), 2012* PC_SNAPBEN10: SNAP benefits per capita, 2010 SNAP_PART_RATE08: SNAP participants (% eligible pop), 2008* SNAP_PART_RATE13: SNAP participants (% eligible pop), 2013* SNAP_OAPP09: SNAP online application, 2009* SNAP_CAP09: SNAP Combined Application Project , 2009* SNAP_BBCE09: SNAP Broad-based Categorical Eligibility, 2009* SNAP_REPORTSIMPLE09: SNAP simplified reporting, 2009* PCT_NSLP09: National School Lunch Program participants (% pop), 2009* PCT_FREE_LUNCH09: Students eligible for free lunch (%), 2009 PCT_REDUCED_LUNCH09: Students eligible for reduced-price lunch (%), 2009 PCT_SBP09: School Breakfast Program participants (% pop), 2009* PCT_SFSP09: Summer Food Service Program participants (% pop), 2009* PCT_WIC09: WIC participants (% pop), 2009* PCT_CACFP09: Child & Adult Care (% pop), 2009* PCT_LACCESS_POP10: Population, low access to store (%), 2010 PCT_LACCESS_LOWI10: Low income & low access to store (%), 2010 PCT_LACCESS_HHNV10: Households, no car & low access to store (%), 2010 PCT_LACCESS_CHILD10: Children, low access to store (%), 2010 PCT_LACCESS_SENIORS10: Seniors, low access to store (%), 2010 MILK_PRICE10: Price of low-fat milk/national average, 2010** SODA_PRICE10: Price of sodas/national average, 2010** MILK_SODA_PRICE10: Price of low-fat milk/price of sodas, 2010** FFRPTH09: Fast-food restaurants/1,000 pop, 2009 PC_FFRSALES07: Expenditures per capita, fast food, 2007* PC_FFRSALES12: Expenditures per capita, fast food, 2012* PC_FSRSALES07: Expenditures per capita, restaurants, 2007* PC_FSRSALES12: Expenditures per capita, restaurants, 2012* GROCPTH09: Grocery stores/1,000 pop, 2009 CONVSPTH09: Convenience stores/1,000 pop, 2009 SNAPSPTH12: SNAP-authorized stores/1,000 pop, 2012 WICSPTH08: WIC-authorized stores/1,000 pop, 2008 WICSPTH12: WIC-authorized stores/1,000 pop, 2012 PCT_LOCLFARM07: Farms with direct sales (%), 2007 PCT_LOCLFARM12: Farms with direct sales (%), 2012 PCT_LOCLSALE12: Direct farm sales (%), 2012 PC_DIRSALES07: Direct farm sales per capita, 2007 PC_DIRSALES12: Direct farm sales per capita, 2012 FARM_TO_SCHOOL09: Farm to school program, 2009 FARM_TO_SCHOOL13: Farm to school program, 2013 2010_CENSUS_POP POP_EST_2011 POP_EST_2012 POP_EST_2013 POVRATE13_ALL MEDHHINC13 POVRATE13_CHILD
Some features are better represented on a log scale. We log-transform columns whose IQR is small relative to their full range (i.e., strongly skewed distributions), plus a few columns chosen by iterating on the model.
cont_feats = X_train.columns[X_train.dtypes == float]
columns_to_log = ['PCT_NHWHITE10', 'PCT_NHBLACK10', 'PCT_LOCLFARM12'] #from iterating on the model
for c in cont_feats:
q1 = X_train[c].quantile(0.25)
q3 = X_train[c].quantile(0.75)
iqr = q3-q1
if iqr < (0.1)*(max(X_train[c])-min(X_train[c])):
columns_to_log.append(c)
for c in np.unique(columns_to_log):
#replace zeros with the training-set median before taking log10, and apply the same value to the test set
X_train_median = np.median(X_train[c])
X_train = convert2log(c, X_train, replacezero=X_train_median)
X_test = convert2log(c, X_test, replacezero=X_train_median)
X_train.describe()
PCT_65OLDER10 | FOODINSEC_10_12 | VLFOODSEC_10_12 | FOODINSEC_CHILD_01_07 | FOODINSEC_CHILD_03_11 | REDEMP_SNAPS12 | PCT_SNAP12 | PC_SNAPBEN10 | SNAP_PART_RATE08 | SNAP_PART_RATE13 | ... | LOG10_PCT_NHNA10 | LOG10_PCT_NHWHITE10 | LOG10_PCT_REDUCED_LUNCH09 | LOG10_PC_DIRSALES07 | LOG10_PC_DIRSALES12 | LOG10_POP_EST_2011 | LOG10_POP_EST_2012 | LOG10_POP_EST_2013 | LOG10_WICSPTH08 | LOG10_WICSPTH12 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2.200000e+03 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | ... | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 |
mean | 15.880158 | 15.020455 | 5.790818 | 8.596727 | 9.283545 | 2.492448e+05 | 15.066528 | 18.358982 | 68.310000 | 86.266319 | ... | -0.363553 | 1.871002 | 0.951654 | 0.603828 | 0.602346 | 4.449571 | 4.449523 | 4.449678 | -0.681956 | -0.711227 |
std | 4.205627 | 2.360521 | 0.859697 | 1.907775 | 1.806446 | 1.230882e+05 | 3.624165 | 9.446565 | 10.388072 | 8.995722 | ... | 0.538120 | 0.165124 | 0.159644 | 0.532958 | 0.553057 | 0.639121 | 0.640911 | 0.642253 | 0.279336 | 0.260403 |
min | 3.470599 | 8.700000 | 3.200000 | 4.800000 | 5.100000 | 0.000000e+00 | 5.866367 | 1.374386 | 48.000000 | 56.955000 | ... | -1.834039 | 0.426172 | -1.240799 | -2.184197 | -2.658979 | 1.954243 | 1.908485 | 1.949390 | -1.829084 | -1.836582 |
25% | 13.137261 | 13.400000 | 5.100000 | 7.200000 | 7.900000 | 1.629788e+05 | 12.391268 | 11.300859 | 61.000000 | 79.024000 | ... | -0.699490 | 1.826364 | 0.882237 | 0.315982 | 0.290486 | 4.034518 | 4.035820 | 4.033444 | -0.870163 | -0.887251 |
50% | 15.569573 | 14.900000 | 5.700000 | 8.300000 | 9.000000 | 2.457496e+05 | 15.223027 | 16.966789 | 66.000000 | 86.331000 | ... | -0.504170 | 1.933048 | 0.967025 | 0.616968 | 0.635756 | 4.409603 | 4.408698 | 4.409053 | -0.720273 | -0.750353 |
75% | 18.235988 | 16.600000 | 6.500000 | 9.500000 | 10.400000 | 3.267956e+05 | 18.353499 | 23.615687 | 78.000000 | 92.597000 | ... | -0.189132 | 1.973788 | 1.043876 | 0.959782 | 0.964971 | 4.817666 | 4.818016 | 4.818668 | -0.531058 | -0.575330 |
max | 34.129068 | 20.900000 | 8.100000 | 12.600000 | 12.800000 | 1.253321e+06 | 21.899881 | 76.284852 | 100.000000 | 100.000000 | ... | 1.977475 | 1.996350 | 1.658977 | 2.397940 | 2.309440 | 6.717228 | 6.718503 | 6.719355 | 0.664542 | 0.476254 |
8 rows × 53 columns
Next, scale the features so they are all of the same order of magnitude (min-max scaling to [0, 1]), and apply the transformation fitted on the training set to the test set as well. Let's proceed without the binary/categorical features, (1) for simplicity and (2) because these features don't show much sensitivity to our target.
scaler = preprocessing.MinMaxScaler()
cont_feats = X_train.columns[X_train.dtypes == float]
cat_feats = X_train.columns[X_train.dtypes != float]
scaler.fit(X_train[cont_feats])
X_scaled = pd.DataFrame(scaler.transform(X_train[cont_feats]), \
index=X_train.index, columns=cont_feats)
#X_scaled = X_scaled.merge(X_train[cat_feats], left_index=True, right_index=True)
X_unscaled_test = X_test.copy()
X_test = pd.DataFrame(scaler.transform(X_test[cont_feats]), index=X_test.index, columns=cont_feats)
#X_test = X_test.merge(X_unscaled_test[cat_feats], left_index=True, right_index=True)
X_scaled.describe()
PCT_65OLDER10 | FOODINSEC_10_12 | VLFOODSEC_10_12 | FOODINSEC_CHILD_01_07 | FOODINSEC_CHILD_03_11 | REDEMP_SNAPS12 | PCT_SNAP12 | PC_SNAPBEN10 | SNAP_PART_RATE08 | SNAP_PART_RATE13 | ... | LOG10_PCT_NHNA10 | LOG10_PCT_NHWHITE10 | LOG10_PCT_REDUCED_LUNCH09 | LOG10_PC_DIRSALES07 | LOG10_PC_DIRSALES12 | LOG10_POP_EST_2011 | LOG10_POP_EST_2012 | LOG10_POP_EST_2013 | LOG10_WICSPTH08 | LOG10_WICSPTH12 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | ... | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 | 2200.000000 |
mean | 0.404768 | 0.518070 | 0.528738 | 0.486760 | 0.543318 | 0.198868 | 0.573808 | 0.226732 | 0.390577 | 0.680946 | ... | 0.385801 | 0.920169 | 0.756077 | 0.608455 | 0.656411 | 0.523900 | 0.528280 | 0.524173 | 0.460024 | 0.486569 |
std | 0.137177 | 0.193485 | 0.175448 | 0.244587 | 0.234603 | 0.098210 | 0.226037 | 0.126105 | 0.199771 | 0.208984 | ... | 0.141183 | 0.105163 | 0.055054 | 0.116312 | 0.111315 | 0.134185 | 0.133245 | 0.134645 | 0.112020 | 0.112590 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.315302 | 0.385246 | 0.387755 | 0.307692 | 0.363636 | 0.130038 | 0.406954 | 0.132511 | 0.250000 | 0.512696 | ... | 0.297664 | 0.891741 | 0.732138 | 0.545636 | 0.593643 | 0.436759 | 0.442272 | 0.436912 | 0.384549 | 0.410462 |
50% | 0.394637 | 0.508197 | 0.510204 | 0.448718 | 0.506494 | 0.196079 | 0.583569 | 0.208147 | 0.346154 | 0.682449 | ... | 0.348908 | 0.959685 | 0.761377 | 0.611323 | 0.663135 | 0.515509 | 0.519793 | 0.515657 | 0.444658 | 0.469653 |
75% | 0.481609 | 0.647541 | 0.673469 | 0.602564 | 0.688312 | 0.260744 | 0.778814 | 0.296905 | 0.576923 | 0.828017 | ... | 0.431563 | 0.985630 | 0.787880 | 0.686138 | 0.729397 | 0.601183 | 0.604890 | 0.601530 | 0.520538 | 0.545327 |
max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 53 columns
We will remove feature columns with VIF > 10.
Note: collinearity can result in unstable coefficients for linear regression. It is less of a concern for decision trees, but it can make feature importances harder to interpret.
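To make the definition concrete, the VIF for feature j is 1 / (1 - R^2_j), where R^2_j comes from regressing feature j on all of the remaining features. Below is a minimal sketch of this for a single column, assuming X_scaled as constructed above (illustrative only; the actual computation that follows uses statsmodels).
# Regress the first column on the rest and convert its R^2 into a VIF.
first_col = X_scaled.columns[0]
other_cols = X_scaled.columns.drop(first_col)
r2_j = LinearRegression().fit(X_scaled[other_cols], X_scaled[first_col]).score(X_scaled[other_cols], X_scaled[first_col])
print('VIF({}) = {:.2f}'.format(first_col, 1.0/(1.0 - r2_j)))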
vif = pd.Series([variance_inflation_factor(add_constant(X_scaled).values, i) \
for i in range(add_constant(X_scaled).shape[1])], index=['const']+X_scaled.columns.tolist())
/Users/jieunchoi/anaconda/envs/myenv/lib/python3.6/site-packages/numpy/core/fromnumeric.py:2389: FutureWarning: Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead. return ptp(axis=axis, out=out, **kwargs)
to_drop = vif.index[vif>10].tolist()
to_drop.remove('const')
It's interesting to observe the correlation matrix as well.
plt.figure(figsize=(11,10))
s = sns.heatmap(X_scaled.corr(), xticklabels=X_scaled.columns, yticklabels=X_scaled.columns)
print(to_drop)
['FOODINSEC_CHILD_01_07', 'FOODINSEC_CHILD_03_11', 'PCT_SNAP12', 'PCT_SBP09', 'PCT_LACCESS_POP10', 'PCT_LACCESS_CHILD10', 'MILK_PRICE10', 'SODA_PRICE10', 'MILK_SODA_PRICE10', 'PC_FFRSALES07', 'PC_FFRSALES12', 'PC_FSRSALES07', 'PC_FSRSALES12', 'POVRATE13_ALL', 'POVRATE13_CHILD', 'LOG10_2010_CENSUS_POP', 'LOG10_POP_EST_2011', 'LOG10_POP_EST_2012', 'LOG10_POP_EST_2013']
X_scaled = X_scaled.drop(columns=to_drop, axis=1)
X_test = X_test.drop(columns=to_drop, axis=1) #apply to the test set also so we don't forget later!
X_scaled.head()
PCT_65OLDER10 | FOODINSEC_10_12 | VLFOODSEC_10_12 | REDEMP_SNAPS12 | PC_SNAPBEN10 | SNAP_PART_RATE08 | SNAP_PART_RATE13 | PCT_NSLP09 | PCT_FREE_LUNCH09 | PCT_SFSP09 | ... | LOG10_PCT_LOCLSALE12 | LOG10_PCT_NHASIAN10 | LOG10_PCT_NHBLACK10 | LOG10_PCT_NHNA10 | LOG10_PCT_NHWHITE10 | LOG10_PCT_REDUCED_LUNCH09 | LOG10_PC_DIRSALES07 | LOG10_PC_DIRSALES12 | LOG10_WICSPTH08 | LOG10_WICSPTH12 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3015 | 0.381701 | 0.204918 | 0.306122 | 0.221873 | 0.375637 | 0.711538 | 0.482820 | 0.657248 | 0.477718 | 0.313902 | ... | 0.526186 | 0.395177 | 0.578183 | 0.195719 | 0.991125 | 0.724763 | 0.611169 | 0.663135 | 0.529675 | 0.510779 |
3059 | 0.486861 | 0.450820 | 0.346939 | 0.191609 | 0.142463 | 0.346154 | 1.000000 | 0.525096 | 0.378349 | 0.295754 | ... | 0.723613 | 0.433470 | 0.554845 | 0.306724 | 0.991254 | 0.838837 | 0.905857 | 0.920026 | 0.581093 | 0.603447 |
2142 | 0.478652 | 0.540984 | 0.693878 | 0.304894 | 0.449968 | 0.423077 | 0.539250 | 0.724045 | 0.669043 | 0.078103 | ... | 0.389083 | 0.383781 | 0.764714 | 0.797763 | 0.877798 | 0.774856 | 0.570662 | 0.571589 | 0.504645 | 0.413468 |
1330 | 0.579503 | 0.155738 | 0.326531 | 0.108204 | 0.089784 | 0.250000 | 0.689209 | 0.695890 | 0.333892 | 0.414491 | ... | 0.365171 | 0.663391 | 0.451798 | 0.295962 | 0.970904 | 0.787307 | 0.686941 | 0.752539 | 0.592550 | 0.636105 |
217 | 0.564025 | 0.565574 | 0.510204 | 0.075150 | 0.091523 | 0.000000 | 0.220490 | 0.240939 | 0.294896 | 0.033815 | ... | 0.593569 | 0.495355 | 0.480814 | 0.576048 | 0.957520 | 0.746627 | 0.539264 | 0.678855 | 0.488966 | 0.573735 |
5 rows × 34 columns
Before we proceed, let's first establish a baseline performance to compare with subsequent results. A very simple baseline guess is the median of the target values. We will use two metrics: MAE (mean absolute error, which we want to minimize) and adjusted R2 (one minus the ratio of residual variance to total variance in y, adjusted for the numbers of observations and features; we want it as close to 1 as possible).
Unlike R2, adjusted R2 will not arbitrarily increase with an increasing number of terms if the added complexity in the model does not actually improve the prediction.
Another popular choice is RMSE (root mean squared error, i.e., the standard deviation of the residuals) because it's easily and smoothly differentiable, but the squaring makes it quite sensitive to outliers.
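As a toy illustration of how these metrics behave (made-up numbers, not part of the pipeline, with one deliberately bad prediction to show RMSE's sensitivity to outliers):
y_toy_true = np.array([20.0, 25.0, 30.0, 35.0])
y_toy_pred = np.array([21.0, 24.0, 29.0, 45.0])  # last prediction misses by 10
print('MAE   :', metrics.mean_absolute_error(y_toy_true, y_toy_pred))
print('RMSE  :', np.sqrt(metrics.mean_squared_error(y_toy_true, y_toy_pred)))  # dominated by the single big miss
print('R2_adj:', adjusted_r2(y_toy_true, y_toy_pred, n=len(y_toy_true), k=2))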
kf = model_selection.KFold(n_splits=5)
mae_list = []
for train_index, val_index in kf.split(X_scaled):
X_train_kf, X_val_kf = X_scaled.iloc[train_index], X_scaled.iloc[val_index]
y_train_kf, y_val_kf = y_train.iloc[train_index], y_train.iloc[val_index]
y_simple_guess = np.median(y_train_kf)
mae_list.append(metrics.mean_absolute_error(y_val_kf, y_simple_guess*np.ones(len(y_val_kf))))
print("MAE: {:3.2f}".format(np.mean(mae_list)))
MAE: 3.45
We will evaluate the generalized performance on the following ML algorithms, with default choices of the hyperparameters for now.
#choosing default parameters. explicitly specified n_estimators and gamma for RF and SVM to ward off warning messages.
model_list = {'LR':LinearRegression(), 'LASSO':Lasso(), 'RIDGE':Ridge(), 'KNN':KNeighborsRegressor(), \
'GBR':GradientBoostingRegressor(), 'RF':RandomForestRegressor(n_estimators=100), 'SVM':SVR(gamma='scale')}
rn = np.random.randint(0, 500)
for m in model_list.keys():
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=rn)
mae_list = []
adjusted_r2_list = []
for train_index, val_index in kf.split(X_scaled):
X_train_kf, X_val_kf = X_scaled.iloc[train_index], X_scaled.iloc[val_index]
y_train_kf, y_val_kf = y_train.iloc[train_index], y_train.iloc[val_index]
clf = model_list[m]
clf.fit(X_train_kf, y_train_kf)
y_pred = clf.predict(X_val_kf)
mae_list.append(metrics.mean_absolute_error(y_val_kf, y_pred))
adjusted_r2_list.append(adjusted_r2(y_val_kf, y_pred, np.shape(X_train_kf)[0], np.shape(X_train_kf)[1]))
print("{:4}".format(m)+" ----- ", "MAE: {:3.2f}".format(np.mean(mae_list))+'+/-'+"{:3.2f}".format(np.std(mae_list)), \
" -- R2_adj: {:3.2f}".format(np.mean(adjusted_r2_list)))
LR ----- MAE: 2.37+/-0.10 -- R2_adj: 0.53 LASSO ----- MAE: 3.45+/-0.21 -- R2_adj: -0.02 RIDGE ----- MAE: 2.36+/-0.11 -- R2_adj: 0.54 KNN ----- MAE: 2.21+/-0.07 -- R2_adj: 0.59 GBR ----- MAE: 2.03+/-0.11 -- R2_adj: 0.66 RF ----- MAE: 2.04+/-0.10 -- R2_adj: 0.65 SVM ----- MAE: 2.14+/-0.11 -- R2_adj: 0.61
RF and GBR both seem promising. RF is nicer to interpret, and it gives feature importance, so let's go with it.
Rather than searching the hyperparameter space via grid search (brute force) or randomized search (a slight improvement over grid search), we will turn to a Bayesian optimization technique (the Tree Parzen Estimator) via hyperopt.
First define an objective function that returns a loss to be minimized. We will use a cross validation score so that we don't have to split the data further into training & validation sets.
def objective(hparams):
rf = RandomForestRegressor(**hparams)
cv_score = model_selection.cross_val_score(rf, X_scaled, y_train, cv=5, scoring='neg_mean_absolute_error')
loss = -np.mean(cv_score)
#progress bar
pbar.update()
return {'loss': loss, 'hparams': hparams, 'status': STATUS_OK}
Define the hyperparameter space over which we will conduct the search. We will use uniform distributions for simplicity, except for min_impurity_decrease, which gets a log-uniform prior since it spans several orders of magnitude.
domain = {
"n_estimators": scope.int(hp.quniform("n_estimators",50,150,2)),\
"max_depth": scope.int(hp.quniform('max_depth', 1,10,1)),\
"min_samples_split": scope.int(hp.quniform('min_samples_split', 2,20,1)),\
"min_samples_leaf": scope.int(hp.quniform('min_samples_leaf', 1,10,1)),\
"max_features": hp.uniform('max_features',0.01,1.0),\
"min_impurity_decrease": hp.loguniform("min_impurity_decrease", np.log(1e-4),np.log(0.5)),\
}
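To see what a single candidate configuration drawn from this space looks like, we can use the `sample` helper imported earlier (purely illustrative; values will differ from run to run).
# Draw one random configuration from the search space.
print(sample(domain))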
Run the optimization and store the trial information.
num_iter = 1000
pbar = tqdm_notebook(total=num_iter, desc='Hyperopt')
bayes_trials = Trials()
best = fmin(fn=objective, space=domain, algo=tpe.suggest, \
max_evals=num_iter, trials=bayes_trials)
pbar.close()
100%|██████████| 1000/1000 [2:18:32<00:00, 8.79s/it, best loss: 2.038474812355979]
Hyperopt returns the parameters as floats, so convert the relevant ones back to integer data types.
#just a backup copy
best_bu = best.copy()
hyperparam_list = list(best.keys())
hyperparam_list.pop(hyperparam_list.index("max_features"))
hyperparam_list.pop(hyperparam_list.index("min_impurity_decrease"))
for key in hyperparam_list:
best[key] = int(best[key])
print(best)
{'max_depth': 10, 'max_features': 0.8832777933546546, 'min_impurity_decrease': 0.0006693160693484598, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 142}
plt.figure(1, figsize=(12,5))
for i_k, k in enumerate(bayes_trials.vals.keys()):
ax = plt.subplot(2, 4, i_k+1)
ax.scatter(range(len(bayes_trials.vals[k])), bayes_trials.vals[k], label=k, s=2)
ax.set_ylabel(' '.join(k.split('_')))
ax.set_xlabel('iteration')
ax.axhline(best[k], color='C1', linewidth=2.5)
plt.tight_layout()
ax = plt.subplot(2, 4, 8)
ax.scatter(range(len(bayes_trials.losses())), bayes_trials.losses(), s=2)
ax.set_ylim(1.9, 3)
ax.set_xlabel('iteration')
ax.set_ylabel('loss')
Text(0, 0.5, 'loss')
Before we move on to deploying the best model, let's construct a learning curve to examine the bias-variance trade-off.
train_size, train_scores, test_scores = model_selection.learning_curve(RandomForestRegressor(**best), \
X_scaled, y_train, train_sizes=np.linspace(0.1, 1.0, 10),\
cv=5, scoring='neg_mean_absolute_error')
train_scores = np.abs(train_scores)
test_scores = np.abs(test_scores)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
As expected, the test (CV) error is larger than the training error (the CV error was computed on previously unseen data). There appears to be some overfitting (a difference in MAE of ~1, i.e., about 1 percentage point in the obesity rate), meaning the current model has high variance. Getting more data is not really an option, since we are limited to the # of counties in the U.S. Let's move on to examining the validation curves below to see how individual parameters of our model influence this bias-variance trade-off.
plt.plot(train_size, train_scores_mean, linewidth=2.5, label='Training Error')
plt.fill_between(train_size, train_scores_mean-train_scores_std, train_scores_mean+train_scores_std, alpha=0.5)
plt.plot(train_size, test_scores_mean, linewidth=2.5, label='CV Error')
plt.fill_between(train_size, test_scores_mean-test_scores_std, test_scores_mean+test_scores_std, alpha=0.5)
plt.xlabel('# Training Data')
plt.ylabel('MAE')
leg = plt.legend(loc=1, frameon=False)
plt.ylim(0, 3)
(0, 3)
We will also plot the validation curve for a few hyperparameters.
def plot_validation_curve(p, p_arr, p_label):
train_scores, test_scores = model_selection.validation_curve(RandomForestRegressor(**best), \
X_scaled, y_train, param_name=p, param_range=p_arr,
cv=5, scoring='neg_mean_absolute_error')
train_scores = np.abs(train_scores)
test_scores = np.abs(test_scores)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.plot(p_arr, train_scores_mean, linewidth=2.5, label='Training Error')
plt.fill_between(p_arr, train_scores_mean-train_scores_std, train_scores_mean+train_scores_std, alpha=0.5)
plt.plot(p_arr, test_scores_mean, linewidth=2.5, label='CV Error')
plt.fill_between(p_arr, test_scores_mean-test_scores_std, test_scores_mean+test_scores_std, alpha=0.5)
plt.xlabel(p_label)
plt.ylabel('MAE')
plt.ylim(0, 3)
leg = plt.legend(loc=1, frameon=False)
First, n_estimators is the number of decision trees in the forest. The two errors quickly diverge beyond ~50 trees, and the CV error asymptotes while the training error continues to decrease: as we increase the number of estimators, the training set becomes increasingly overfit.
plot_validation_curve('n_estimators', np.arange(10, 100, 20), 'Number of Estimators')
The deeper the individual tree goes, the more it will overfit.
plot_validation_curve('max_depth', np.arange(1,11), 'Maximum Tree Depth')
Next, max_features sets the fraction of features considered at each split in the decision tree.
plot_validation_curve('max_features', np.linspace(0.01, 1, 10), 'Maximum Features')
Note that all CV errors plateau to ~2% regardless of the choice of parameters, # samples, etc.
Train on the full train data set then check the MAE and the adjusted R2 scores on the test set.
rf_tuned = RandomForestRegressor(**best)
rf_tuned.fit(X_scaled, y_train)
y_pred = rf_tuned.predict(X_test)
print("{:4}".format('RF')+" ----- ", "MAE: {:3.2f}".format(metrics.mean_absolute_error(y_test, y_pred)), \
" -- R2_adj: {:3.2f}".format(adjusted_r2(y_test, y_pred, np.shape(X_test)[0], np.shape(X_test)[1])))
RF ----- MAE: 2.01 -- R2_adj: 0.68
In terms of mean absolute error, the naive baseline (median guess, MAE of 3.45) has an error nearly 70% larger than the tuned model's 2.01; put differently, the model cuts the MAE by roughly 40%!
Let's examine which features played a crucial role in predicting the obesity rates. The feature importance of a single predictor variable combines the fraction of the samples split according to that variable—a measure of its "sphere of influence," if you will—and the decrease in impurity (variance/MSE for a regression tree) resulting from that split. Feature importances are computed for each decision tree, then averaged over the ensemble of trees to produce the final list we examine here.
The most important predictors by far are related to socioeconomic status.
feat_imp = rf_tuned.feature_importances_
feat_imp_ind = np.argsort(feat_imp)[-20:]
s = sns.barplot(x=feat_imp[feat_imp_ind], y=X_scaled.columns[feat_imp_ind])
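As a quick sanity check on the "averaged over the ensemble of trees" statement above, the forest-level importances should match the mean of the per-tree impurity-based importances (a sketch; the agreement may only be approximate depending on the scikit-learn version's normalization).
# Compare the mean of per-tree importances with the forest's reported importances.
per_tree_imp = np.mean([tree.feature_importances_ for tree in rf_tuned.estimators_], axis=0)
print(np.allclose(per_tree_imp, rf_tuned.feature_importances_))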