from os import mkdir, path, getcwd
from random import shuffle, seed
from urllib.request import urlretrieve
from requests import get
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.nonparametric.kde import KDEUnivariate
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib import rc
from sklearn import preprocessing
from sklearn import linear_model
import seaborn as sns
%matplotlib notebook
from IPython.display import HTML
# --- Global figure configuration ---------------------------------------
# Directory where generated figures are saved; created on first run.
dirname = "../srcimages/"
if not path.exists(dirname):
    mkdir(dirname)
imageformat = '.pdf'

# LaTeX-rendered sans-serif text for every figure.
rc('font', **{'family': 'sans-serif', 'sans-serif': ['Computer Modern Roman']})
params = {
    'axes.labelsize': 12,
    'font.size': 12,
    'legend.fontsize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'text.usetex': True,       # route all text through LaTeX
    'figure.figsize': (8, 6),
}
plt.rcParams.update(params)

# Seaborn styling: poster context, colorblind-safe palette, framed legends.
sns.set_context("poster")
sns.set_palette("colorblind")
sns.axes_style()
sns.set_style({'legend.frameon': True})

# Named colors reused throughout the figures below.
color_blind_list = sns.color_palette("colorblind", 8)
my_orange = color_blind_list[2]
my_green = color_blind_list[1]
my_blue = color_blind_list[0]
plt.close("all")
The Current Population Survey (CPS) is a monthly survey of about 57,000 U.S. households conducted by the Bureau of the Census of the Bureau of Labor Statistics. The CPS is the primary source of information on the labor force characteristics of the U.S. population. The survey covers employment, earnings, educational attainment, income, poverty, health insurance coverage, job experience, voting and registration, computer usage, veteran status, and other variables.
Details can be found at https://www.census.gov/programs-surveys/cps.html and https://dataferrett.census.gov/
From the March 2009 survey we extracted the individuals with non-allocated variables who were full-time employed (defined as those who had worked at least 36 hours per week for at least 48 weeks the past year), and excluded those in the military. This sample has 50,742 individuals. We extracted 12 variables from the CPS on these individuals.
age: years, capped at 85
female: 1 Female, 0 otherwise
hisp: 1 if Spanish, Hispanic, or Latino, 0 otherwise
education:
earnings: total annual wage and salary earnings
hours: number of hours worked per week
week: number of weeks worked per year
union: 1 for member of a labor union, 0 otherwise
uncov: 1 if covered by a union or employee association contract, 0 otherwise
region:
race:
marital:
# Information on the dataset:
# https://www.ssc.wisc.edu/~bhansen/econometrics/cps09marReadMe.pdf
URL = 'https://www.ssc.wisc.edu/~bhansen/econometrics/'


def get_cps09mar_data(url=URL, filename='cps09mar.txt'):
    """Download (if not already cached) and load the March 2009 CPS extract.

    Parameters
    ----------
    url : str
        Base URL of the remote directory holding the data file.
    filename : str
        Name of the data file; fetched from ``url`` and cached in the
        current working directory.

    Returns
    -------
    pandas.DataFrame
        One row per individual, columns documented in the readme above.
    """
    # BUG FIX: the original concatenated the *global* URL, silently
    # ignoring the `url` parameter.
    url_full = url + filename
    target_path = path.join(getcwd(), filename)  # local cache location
    if not path.exists(filename):
        response = get(url_full, stream=True)
        # BUG FIX: use a context manager — the original leaked the handle.
        with open(target_path, "wb") as handle:
            for chunk in response.iter_content(chunk_size=512):
                if chunk:  # filter out keep-alive new chunks
                    handle.write(chunk)
    u_cols = ['age', 'female', 'hisp', 'education', 'earnings', 'hours',
              'week', 'union', 'uncov', 'region', 'race', 'marital']
    # BUG FIX: read the requested file rather than a hard-coded 'cps09mar.txt'.
    data = pd.read_csv(filename, sep=r"\s+",
                       names=u_cols, na_values='NA', dtype=np.float64)
    return data
# Fetch the CPS extract and tidy it up for the analysis below.
data = get_cps09mar_data(url=URL, filename='cps09mar.txt')

# To get correct display format
pd.options.display.float_format = '{:,.3f}'.format

# For this dataset missing values are marked as NA.
n_samples_ini, n_features_ini = data.shape
print("The original dataset has {} samples and {} features".format(
    n_samples_ini, n_features_ini))

# Drop every row containing at least one missing value.
data = data.dropna(axis=0, how='any')
data.head()
# TODO : XXX Figure 2.1: Wage Distribution and Density. All full-time U.S. workers
# Target variable: log of the hourly wage, i.e. annual earnings spread
# over (hours per week) * (weeks per year).
hourly_wage = data['earnings'] / (data['hours'] * data['week'])
y = np.log(hourly_wage)
print(y.shape)
data['log-hourwage'] = y
# If high education needed:
# data['high-education'] = (data['education']> 9).astype(float)
data.head()

# Boolean masks splitting the sample by gender, plus group sizes.
male = data['female'] == 0.
female = data['female'] == 1.
n_male = np.sum(male)
n_female = np.sum(female)
print("In the dataset, there are: \n n_male = {} male \n n_female = {} female.".format(
    n_male, n_female))
Or use groupby:
# Count individuals per gender (female: 0 = male, 1 = female).
data.groupby('female').size()
# Range (in years) of the education variable.
print(data['education'].min())
print(data['education'].max())
Or use describe method:
# Summary statistics (count, mean, std, quartiles, ...) for education.
data['education'].describe()
# Compare the education distributions of men and women with kernel
# density estimates (Gaussian kernel, bandwidth fixed at 2 years).
# Evaluation grid slightly wider than the observed range (50 points).
xgrid = np.linspace(data['education'].min() * 0.9,
data['education'].max() * 1.1)
fig1 = plt.figure(figsize=(8, 5))
# Male density: filled blue curve.
kde_male = KDEUnivariate(data['education'][male])
kde_male.fit(bw=2, kernel='gau')
pdf_est_male = kde_male.evaluate(xgrid)
plt.plot(xgrid, pdf_est_male, color=my_blue, label='Male')
plt.fill_between(xgrid, 0, pdf_est_male, facecolor=my_blue, alpha=0.5)
# Female density: filled orange curve, drawn on top of the male one.
kde_female = KDEUnivariate(data['education'][female])
kde_female.fit(bw=2, kernel='gau')
pdf_est_female = kde_female.evaluate(xgrid)
plt.plot(xgrid, pdf_est_female, color=my_orange, label='Female',)
plt.fill_between(xgrid, 0, pdf_est_female, facecolor=my_orange, alpha=0.5)
plt.legend()
Here we apply a simple 1D OLS method with intercept to explain the log(wage) w.r.t. education (the number of education years).
# Univariate OLS with intercept: regress log(wage) on years of education.
skl_lm = linear_model.LinearRegression(fit_intercept=True)
X_uni = data['education'].values.reshape(-1, 1)  # sklearn expects a 2D design
skl_lm.fit(X_uni, y)

plt.figure()
plt.plot(xgrid.reshape(-1, 1), skl_lm.predict(xgrid.reshape(-1, 1)))
plt.plot(X_uni, y, '.')
plt.xlabel('Education (years)')  # FIX: label typo "Eduction"
plt.ylabel('log(wage)')

# Improved visualisation using jittering:
jitter = 0.05  # to avoid overlap
plt.figure()
plt.plot(xgrid.reshape(-1, 1), skl_lm.predict(xgrid.reshape(-1, 1)))
plt.plot(X_uni.squeeze() + jitter * np.random.randn(X_uni.shape[0]), y, '.')
plt.xlabel('Education (years)')
plt.ylabel('log(wage)')
plt.title('Raw data: log(wage) prediction according to education level')
The observations are averaged (via a groupby) according to the number of education years.
# Collapse the data: average log(wage) within each education level
# (one point per distinct number of years), then refit the OLS line.
y_reduced = data['log-hourwage'].groupby(data['education']).mean().values
x_reduced = np.sort(data['education'].unique()).reshape(-1, 1)
n_education = x_reduced.shape[0]
y_pred_reduced = skl_lm.fit(x_reduced, y_reduced).predict(x_reduced)

plt.figure()
plt.plot(x_reduced, y_pred_reduced)
plt.plot(x_reduced, y_reduced, '.')
plt.xlabel('Education (years)')  # FIX: label typo "Eduction"
plt.ylabel('log(wage)')
plt.title('Averaged data: log(wage) prediction according to education level')
# OLS of log(wage) on the binary gender indicator only.
skl_lm = linear_model.LinearRegression(fit_intercept=True)
X_uni = data.copy()['female'].values.reshape(-1, 1)
print(X_uni.shape)
skl_lm.fit(X_uni, y)

jitter = 0.01
plt.figure()
xgrid_bin = np.array([0, 1])
# BUG FIX: predict requires a 2D array; the original later called
# skl_lm.predict(xgrid_bin[0]) on a scalar, which raises ValueError
# with scikit-learn. Compute both predictions once, here.
pred_bin = skl_lm.predict(xgrid_bin.reshape(-1, 1))
plt.plot(xgrid_bin.reshape(-1, 1), pred_bin, label="OLS")
plt.plot(X_uni + jitter * np.random.randn(X_uni.shape[0]).reshape(-1, 1), y, '.', label="data")
plt.xlim(-0.5, 1.5)
plt.xticks([0, 1])
plt.xlabel('Gender (M=0, F=1)')
plt.title("log(wage) prediction according to gender")
plt.legend(loc=1)
print("Average log(wage):\
\n for a male it is {:0.4}\
\n for a female it is {:0.4}".format(
    pred_bin[0], pred_bin[1]))
# OLS with two regressors: education + gender indicator. The gender dummy
# shifts the intercept while the slope in education stays shared.
skl_lm = linear_model.LinearRegression(fit_intercept=True)
X_uni = data[['education', 'female']].values
skl_lm.fit(X_uni, y)

# Prediction designs: one per gender, over all observed education levels.
X_to_predic_male = np.column_stack([x_reduced, np.zeros(n_education)])
X_to_predic_female = np.column_stack([x_reduced, np.ones(n_education)])

plt.figure(figsize=(6, 6))
ylims = [-2., 6.]
plt.plot(x_reduced, skl_lm.predict(X_to_predic_male), label='male', c=my_blue)
plt.plot(x_reduced, skl_lm.predict(
    X_to_predic_female), label='female', c=my_orange)

# Jittered scatter of the raw observations, one colour per gender.
jitter = 0.1  # to avoid overlap
plt.plot(data['education'][male] + jitter *
         np.random.randn(n_male), y[male], '.', c=my_blue, ms=2, alpha=0.5)
plt.plot(data['education'][female] + jitter *
         np.random.randn(n_female), y[female], '.', c=my_orange, ms=2, alpha=0.5)
x1, x2, y1, y2 = plt.axis()
plt.axis((x1, x2, ylims[0], ylims[1]))

# Sanity check: re-draw the two lines straight from the fitted
# coefficients; they must overlay the predictions exactly.
plt.plot(x_reduced, skl_lm.intercept_ + x_reduced * skl_lm.coef_[0], c=my_blue)
plt.plot(x_reduced, skl_lm.intercept_ +
         skl_lm.coef_[1] + x_reduced * skl_lm.coef_[0], c=my_orange)
plt.xlabel('Year of education')
plt.ylabel('log(hourly wage)')
plt.legend()
*Conclusion*: adding a categorical variable "female" allows the model to have a different intercept for the two populations (male and female). Note that we made the choice to use only "female" for that. This is confirmed by the fact that the lines displayed
are identical to the predictions given by the model. We can check that in the US population males are expected to have a higher wage.
# OLS with three regressors: education, gender and marital status.
skl_lm = linear_model.LinearRegression(fit_intercept=True)
X_uni = data.copy()[['education', 'female', 'marital']]
# NOTE(review): marital == 1 is taken as "married" here — the CPS marital
# variable has several codes; confirm against the dataset readme.
married = X_uni['marital'] == 1
n_married = np.sum(married)
skl_lm.fit(X_uni, y)

# Prediction designs: one per (gender, marital) combination.
zeros_col = np.zeros([n_education, 1])
ones_col = np.ones([n_education, 1])
X_to_predic_male_married = np.concatenate(
    [x_reduced, zeros_col, ones_col], axis=1)
X_to_predic_male_nonmarried = np.concatenate(
    [x_reduced, zeros_col, zeros_col], axis=1)
X_to_predic_female_married = np.concatenate(
    [x_reduced, ones_col, ones_col], axis=1)
X_to_predic_female_nonmarried = np.concatenate(
    [x_reduced, ones_col, zeros_col], axis=1)

plt.figure(figsize=(6, 6))
# Solid lines: married; dashed: non-married. Blue: male; orange: female.
for design, style, lab, col in (
        (X_to_predic_male_married, '-', 'male-married', my_blue),
        (X_to_predic_male_nonmarried, '--', 'male-nonmarried', my_blue),
        (X_to_predic_female_married, '-', 'female-married', my_orange),
        (X_to_predic_female_nonmarried, '--', 'female-nonmarried', my_orange)):
    plt.plot(x_reduced, skl_lm.predict(design), style, lw=1, label=lab, c=col)
x1, x2, y1, y2 = plt.axis()
plt.axis((x1, x2, ylims[0], ylims[1]))

# Jittered scatter of the raw observations, one colour per gender.
jitter = 0.1  # to avoid overlap
plt.plot(data['education'][male] + jitter *
         np.random.randn(n_male), y[male], '.', c=my_blue, ms=2, alpha=0.5)
plt.plot(data['education'][female] + jitter *
         np.random.randn(n_female), y[female], '.', c=my_orange, ms=2, alpha=0.5)
plt.xlabel('Year of education')
plt.ylabel('log(hourly wage)')
plt.legend()
print(skl_lm.coef_)
This is equivalent to:
$$ \log(wage) \approx \theta_1 \cdot education + \mu_{f,m} $$
where $f \in \{0,1\}$ indicates a female and $m \in \{0,1\}$ a married individual, with the links:
# Extract the fitted parameters: intercept plus the three slopes
# (education, female, marital).
theta0 = skl_lm.intercept_
theta1, theta2, theta3 = skl_lm.coef_
print(' theta0 = {:.4}\n theta1 = {:.4}\n theta2 = {:.4}\n theta3 = {:.4}'.format(
    theta0, theta1, theta2, theta3))

# Group-wise intercepts mu_{female, married}.
mu00 = theta0                    # non-married male (baseline)
mu01 = theta0 + theta3           # married male
mu10 = theta0 + theta2           # non-married female
mu11 = theta0 + theta2 + theta3  # married female
print(' mu00 = {:.4} (non-married male)\
\n mu01 = {:.4} (married male)\
\n mu10 = {:.4} (non-married female)\
\n mu11 = {:.4} (married female)'.format(
    mu00, mu01, mu10, mu11))
*Conclusion*: we can check that the influence of being married is correlated with a lower salary (XXX: more is needed to understand this point here...)
# Piecewise-linear ("kinked") OLS: an extra regressor (x - 9) * 1{x < 9}
# lets the slope change below 9 years of education.
x_low_edu = (x_reduced < 9).astype(float)
X = np.concatenate([x_reduced, (x_reduced - 9) * (x_low_edu)], axis=1)
skl_lm = linear_model.LinearRegression(fit_intercept=True)
skl_lm.fit(X, y_reduced)

plt.figure()
# BUG FIX: the two legend labels were swapped — the kinked model was
# labelled "global OLS" and the plain OLS "OLS with threshold level".
plt.plot(x_reduced, skl_lm.predict(X), label="OLS with threshold level")
plt.plot(x_reduced, y_reduced, '.')
plt.plot(x_reduced, y_pred_reduced, '--', label="global OLS")
plt.xlabel('Education (years)')  # FIX: label typo "Eduction"
plt.ylabel('log(wage)')
plt.title('Averaged data: log(wage) prediction according to education level')
plt.legend()
# Scan every education level as a candidate kink location and record the
# in-sample R^2 of the corresponding piecewise-linear fit.
skl_lm = linear_model.LinearRegression(fit_intercept=True)
scores = np.zeros(n_education)
for i, val in enumerate(x_reduced.squeeze()):
    below_kink = (x_reduced < val).astype(float)
    design = np.concatenate(
        [x_reduced, (x_reduced - val) * below_kink], axis=1)
    scores[i] = skl_lm.fit(design, y_reduced).score(design, y_reduced)
print('Best choice (for R^2 score) on training part was to choose a kink at {}.'.format(
    x_reduced[scores.argmax()][0]))
# Per-gender averaged curves: mean log(wage) by (education, gender),
# then one kinked OLS fit (kink fixed at 9 years) per gender.
# some help for groupby: http://wesmckinney.com/blog/groupby-fu-improvements-in-grouping-and-aggregating-data-in-pandas/
tab = data['log-hourwage'].groupby([data['education'], data['female']]).mean()
# NOTE(review): the even/odd slicing assumes every education level appears
# for both genders, male (female == 0) first — verify on the data.
y_reduced_female = tab.values[1::2]
y_reduced_male = tab.values[::2]

# Same kinked design as above: slope allowed to change below 9 years.
X = np.concatenate([x_reduced, (x_reduced - 9) * (x_low_edu)], axis=1)
skl_lm_female = linear_model.LinearRegression(fit_intercept=True)
skl_lm_female.fit(X, y_reduced_female)
skl_lm_male = linear_model.LinearRegression(fit_intercept=True)
skl_lm_male.fit(X, y_reduced_male)

plt.figure()
plt.plot(x_reduced, skl_lm_female.predict(X), label="female OLS w/o lowedu", c=my_orange)
plt.plot(x_reduced, skl_lm_male.predict(X), label="male OLS w/o lowedu", c=my_blue)
plt.plot(x_reduced, y_reduced_female, '.', c=my_orange, alpha=0.5)
plt.plot(x_reduced, y_reduced_male, '.', c=my_blue, alpha=0.5)
# plt.plot(x_reduced, y_pred_reduced,'--',label="OLS with threshold level")
plt.xlabel('Education (years)')  # FIX: label typo "Eduction"
plt.ylabel('log(wage)')
plt.title('Averaged data: log(wage) prediction according to education level')
plt.legend()