#!/usr/bin/env python
# coding: utf-8

# # Exercise 02
#
# Estimate a regression using the Income data
#
#
# ## Forecast of income
#
# We'll be working with a dataset from US Census income ([data dictionary](https://archive.ics.uci.edu/ml/datasets/Adult)).
#
# Many businesses would like to personalize their offer based on customer’s income. High-income customers could be, for instance, exposed to premium products. As a customer’s income is not always explicitly known, a predictive model could estimate the income of a person based on other information.
#
# Our goal is to create a predictive model that will be able to output an estimation of a person's income.

# In[5]:

import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Enable inline plots only when an IPython/Jupyter shell is available.
# `get_ipython` is injected by IPython, so under a plain `python` run the
# lookup raises NameError; in that case the magic is skipped instead of
# aborting the script.
try:
    _ipython = get_ipython()
except NameError:
    _ipython = None
if _ipython is not None:
    _ipython.run_line_magic('matplotlib', 'inline')

# Read the data, using the first CSV column as the index. The archive and the
# member inside it are both opened as context managers so that each handle is
# closed as soon as the frame has been loaded.
with zipfile.ZipFile('../datasets/income.csv.zip', 'r') as z:
    with z.open('income.csv') as f:
        income = pd.read_csv(f, index_col=0)
income.head()


# In[6]:

income.shape


# # Exercise 2.1
#
# What is the relation between the Age and the Income?
#
# For a one percent increase in the Age, how much does the Income increase?
#
# Using sklearn, estimate a linear regression and predict the income when the Age is 30 and 40 years

# In[3]:

income.plot(x='Age', y='Income', kind='scatter')


# In[ ]:


# # Exercise 2.2
# Evaluate the model using the MSE

# In[ ]:


#
# # Exercise 2.3
#
# Run a regression model using as features the Age and Age$^2$ using the OLS equations

# In[ ]:


# # Exercise 2.4
#
#
# Estimate a regression using more features.
#
# How is the performance compared to using only the Age?

# In[ ]:


# # Exercise 2.5
#
#
# Estimate a logistic regression to predict if a person is in the United States.
#
# What is the performance of the model?

# In[10]:

# Binary target: multiplying the boolean comparison by 1.0 yields 1.0 where
# Country equals 'United-States' and 0.0 otherwise.
income['isUS'] = (income['Country'] == 'United-States') * 1.0
income['isUS'].value_counts()


# In[ ]: