A common claim is that immigration pushes up housing prices, and such claims are sometimes aimed at immigrants of a specific origin; for example, that Chinese immigrants cause housing prices to rise.
In this project, I use annual country-of-birth data on Lawful Permanent Residents (PR) admitted to the United States, together with metro-level housing data, to look for evidence for or against these claims. I aim to answer two questions: (1) is PR admission correlated with housing prices? and (2) are PR admissions from China correlated with housing prices differently than admissions from other countries?
I use linear regression models to estimate correlations between the relevant variables. My findings are that PR admission is positively correlated with housing prices, and that admissions from China are not significantly different from admissions from other countries in this respect.
As a bonus, I also estimated correlations of PR admission with housing prices in prior years. Interestingly, PR admission is positively correlated with housing prices 6 years earlier and negatively correlated with housing prices 7 years earlier.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import qeds
import geopandas as gpd
from pathlib import Path
folder = Path(r"C:\Users\YuchengLiu\Documents\323")
pd.__version__
'1.0.3'
import json
from bokeh.plotting import figure
from bokeh.io import output_notebook, show, output_file
from bokeh.models import GeoJSONDataSource, LinearColorMapper, LogColorMapper
from bokeh.models import ColorBar, HoverTool, LogTicker
from bokeh.palettes import brewer
output_notebook()
import seaborn as sns
import sklearn
from sklearn import linear_model
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant
The housing data is monthly; I converted it to yearly data by averaging within each year.
years = range(2004,2019)
# import housing file
housing_file = folder / 'housing_by_metro.csv'
housing = pd.read_csv(housing_file)
metro_housing = housing['RegionName']
housing=housing.set_index('RegionName').drop(columns=['RegionID','SizeRank'])
# take year average
housing.columns=pd.to_datetime(housing.columns)
housing=housing.T.reset_index().groupby(pd.Grouper(key='index',freq='Y')).mean().T
housing.columns.name = 'year'
log_housing = housing.apply(np.log)
#housing.head()
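As a quick sanity check, here is the same Grouper-based averaging on a toy monthly frame (made-up values, not the real ZHVI data):
# toy example: two months in 2004 and one in 2005 collapse to annual means
toy = pd.DataFrame({pd.Timestamp('2004-01-31'): [100.0],
                    pd.Timestamp('2004-02-29'): [110.0],
                    pd.Timestamp('2005-01-31'): [200.0]}, index=['Toy Metro'])
toy.T.reset_index().groupby(pd.Grouper(key='index', freq='Y')).mean().T
# expected columns: 2004-12-31 -> 105.0, 2005-12-31 -> 200.0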
def get_metro_map(dfa, metro_housing):
    '''Match metro names in the PR data against metro names in the housing file.'''
    metro_pr = dfa.index
    d = {}
    for metro in metro_pr:
        if metro == 'Total':
            d[metro] = 'United States'
            continue
        if (metro_housing == metro).sum() != 0:
            # exact match
            d[metro] = metro
        else:
            # fall back to substring matching: split 'City1-City2, ST1-ST2' into
            # city and state candidates and find housing names containing both
            x = set()
            end = metro.find(',')
            cities = metro[:end].split('-')
            states = metro[end + 1:].strip().split('-')
            for city in cities:
                for state in states:
                    if city != '':
                        a = metro_housing.str.contains(city)
                        b = metro_housing.str.contains(state)
                        s = a & b
                        x = x.union(metro_housing[s[s].index].values)
            if x:
                d[metro] = x.pop()  # empirically, the match is unique
    # one name needs a manual fix
    d['Louisville/Jefferson County, KY-IN'] = 'Louisville-Jefferson County, KY'
    return d
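A quick illustration of the fallback matching on made-up names (hypothetical inputs, not the real files): a PR title like 'Dallas-Fort Worth-Arlington, TX' has no exact housing match, so it is split into cities and states and matched by substring.
# hypothetical demo of get_metro_map's substring fallback
toy_housing = pd.Series(['Dallas-Fort Worth, TX', 'Miami-Fort Lauderdale, FL'])
toy_pr = pd.DataFrame(index=['Dallas-Fort Worth-Arlington, TX'])
get_metro_map(toy_pr, toy_housing)
# maps 'Dallas-Fort Worth-Arlington, TX' -> 'Dallas-Fort Worth, TX'
# (the Louisville entry is always added manually)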
def load_code_file():
    df = pd.read_csv(folder / 'cbsa_to_county.csv')
    df = df[['cbsacode', 'cbsatitle']][1:].drop_duplicates()
    df['cbsacode'] = df['cbsacode'].astype(int)
    df = df.set_index('cbsatitle')
    mapping = get_metro_map(df, metro_housing)
    df.index = df.index.map(mapping)
    return df
code_df=load_code_file()
#code_df
The PR data comes in one file per year. I cleaned each file individually and then appended them together.
# import PR file
def get_year(year, matched_names, common_names, country_level=False):
    '''Load and process the PR file for a single year.'''
    pr_folder_name = folder / 'pr_tables'
    for file in pr_folder_name.iterdir():
        if file.name.startswith(str(year)):
            break
    # extract the data
    header = 3 if year > 2012 else 4
    df = pd.read_excel(file, header=header)
    dfa = df.loc[10:] if country_level else df.loc[1:8]
    # remove footnotes
    dfa = dfa.set_index('Region and country of birth')
    last = dfa.index.get_loc('Unknown')
    dfa = dfa[:last].T
    # match metro names with housing data
    mapping = get_metro_map(dfa, metro_housing)
    matched_names = matched_names.union(mapping.values())
    common_names = common_names.intersection(mapping.values())
    dfa.index = dfa.index.map(mapping)
    dfa = dfa[dfa.index.notnull()]
    dfa = dfa.replace({'D': 0, '-': 0}).astype(float)  # 'D' (withheld) and '-' count as 0
    # merge with cbsa code
    dfa = (dfa.reset_index()
              .merge(code_df.reset_index(), left_on='index', right_on='cbsatitle', how='left')
              .set_index('index')
              .drop('cbsatitle', axis=1))
    dfa.loc['United States', 'cbsacode'] = 0  # put 0 for the U.S.
    # add year
    dfa['year'] = pd.to_datetime(f"{year}-12-31")
    # fix some names
    dfa = dfa.rename(columns={'Korea': 'Korea, South', 'Antigua-Barbuda': 'Antigua and Barbuda'})
    return dfa, matched_names, common_names
# append all the years together
pr = pd.DataFrame()
matched_names = set()
common_names = set(metro_housing)
for year in range(2004, 2019):
    print('-', end='')
    dta, matched_names, common_names = get_year(year, matched_names, common_names, True)
    pr = pr.append(dta)
pr = pr.dropna(axis=1)
print('Done loading PR files!')
---------------Done loading PR files!
The following state and CBSA dataframes are used for drawing maps.
def load_state_df():
    df = gpd.read_file('https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_5m.zip')
    df['cbsacode'] = 0.0  # states get code 0, matching the national total row
    # keep the default integer index; graph_pr drops non-contiguous areas by position
    return df[['NAME', 'cbsacode', 'geometry']]
state_df=load_state_df()
#state_df
def load_cbsa_df():
    df = gpd.read_file("https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_cbsa_5m.zip")
    df['cbsacode'] = df['CBSAFP'].astype(float)
    return df[['cbsacode', 'geometry']]
cbsa_df=load_cbsa_df()
#cbsa_df
First, I visualize the Permanent Resident data on maps.
def graph_pr_bokeh(df1, df2, year, paint_according_to='Total'):
    geojson1 = GeoJSONDataSource(geojson=df1.to_json())
    geojson2 = GeoJSONDataSource(geojson=df2.to_json())
    # heavily skewed series look better on a log color scale
    if paint_according_to in {'Total', 'China', 'Mexico', 'Russia'}:
        color_mapper = LogColorMapper(palette=brewer['YlOrRd'][9][::-1])
    else:
        color_mapper = LinearColorMapper(palette=brewer['YlOrRd'][9][::-1])
    color_bar = ColorBar(color_mapper=color_mapper, ticker=LogTicker(),
                         label_standoff=12, height=20,
                         border_line_color=None, location=(0, 0), orientation='horizontal')
    title = f"Persons Obtaining PR Status by Region and Country of Birth in {year}"
    if paint_according_to != 'Total':
        title += f', colored by persons from {paint_according_to}'
    p = figure(title=title, plot_width=750, plot_height=500, tools="pan,box_zoom,reset,previewsave")
    hover_off = p.patches("xs", "ys", source=geojson1, fill_color='white', line_color='grey')
    hover_on = p.patches("xs", "ys", source=geojson2, line_color='black',
                         fill_color={'field': paint_according_to, 'transform': color_mapper})
    tooltips = [('Region', '@index'), ('Total', '@Total'), ('Canada', '@Canada'),
                ('China', '@China'), ('Mexico', '@Mexico')]
    temp = (paint_according_to, '@' + paint_according_to)
    if temp not in tooltips:
        tooltips.append(temp)
    hover = HoverTool(tooltips=tooltips, renderers=[hover_on])
    p.add_tools(hover)
    #p.add_layout(color_bar, 'below')
    show(p)
def graph_pr(year=2018, paint_according_to='Total', bokeh=True):
    year2 = f"{year}-12-31"
    data = (pr[pr['year'] == year2]
            .drop(index=['Urban Honolulu, HI'])
            .reset_index()
            .drop(columns='year')
            .rename(columns={"China, People's Republic": 'China'}))
    # First draw the US states, then the metros.
    # Dropped rows (outside the contiguous US):
    # 27: Hawaii, 39: US Virgin Islands, 47: Guam,
    # 48: Northern Mariana Islands, 49: American Samoa, 51: Puerto Rico, 55: Alaska
    df1 = state_df.drop(index=[27, 39, 47, 48, 49, 51, 55]).merge(data, on='cbsacode', how='left')
    df2 = cbsa_df.merge(data, on='cbsacode', how='inner')
    if bokeh:
        graph_pr_bokeh(df1, df2, year, paint_according_to)
    else:
        fig, gax = plt.subplots(figsize=(15, 10))
        df1.plot(ax=gax, alpha=0.3)
        df2.plot(ax=gax, edgecolor="black", color="white")
graph_pr(2018)
graph_pr(2018, 'China')
See Appendix 1 for maps colored by other countries of birth.
Next, I show how PR admissions change over time.
# plot the PR data over time for the top n countries
def plot_pr_over_time(region='United States', n=4, percent=False, keep_total=False):
    '''Plot PR admissions over time for every country that ranks in the top n
    in at least one year (so more than n countries may appear); default n=4.'''
    def top_regions(data, num=5):
        return set.union(*list(data.agg(lambda r: set(r.nlargest(num).index), axis=1)))
    data = pr.loc[region].drop(columns=['cbsacode']).set_index('year')
    regions = top_regions(data, n + 1)  # +1 because 'Total' always ranks first
    data = data[regions]
    relative = data.div(data['Total'], axis=0)
    if not keep_total:
        data = data.drop(columns='Total')
        relative = relative.drop(columns='Total')
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    title = f'Persons Obtaining PR Status with Residence in {region}, Top Countries of Birth'
    if percent:
        relative.plot(ax=ax, title=title)
        ax.yaxis.set_major_formatter(PercentFormatter(1))
    else:
        data.plot(ax=ax, title=title)
    ax.legend(bbox_to_anchor=(1, 0.5))
    ax.xaxis.grid(True, alpha=0.3)
plot_pr_over_time('Los Angeles-Long Beach-Anaheim, CA',3)
#plot_pr_over_time('San Francisco, CA')
#plot_pr_over_time('Seattle, WA')
#plot_pr_over_time('Washington, DC')
plot_pr_over_time('New York, NY')
#plot_pr_over_time('Chicago, IL')
plot_pr_over_time('Miami-Fort Lauderdale, FL')
#plot_pr_over_time()
The distribution of PR admissions by country of birth differs greatly across metropolitan areas.
# reduce housing dataframe
housing = housing.loc[matched_names]
log_housing = log_housing.loc[matched_names]
# plotting housing
def plot_housing(regions=None):
    if regions is None:
        regions = ['United States', 'Los Angeles-Long Beach-Anaheim, CA',
                   'San Francisco, CA', 'New York, NY', 'Seattle, WA']
    if 'United States' not in regions:
        regions.append('United States')
    log_housing_change = log_housing.T - log_housing.T.shift(1)  # yearly log change ~ % change
    rel_log_housing_change = log_housing_change.subtract(log_housing_change['United States'], axis=0)
    fig, ax = plt.subplots(2, 1, figsize=(10, 10))
    housing.T[regions].plot(ax=ax[0], title='Zillow Home Value Index (ZHVI) by Region')
    rel_log_housing_change[regions].plot(ax=ax[1], sharex=ax[0],
                                         title='Percentage Change in ZHVI, Normalized to National Average')
    ax[1].get_legend().remove()
    ax[0].xaxis.grid(True, alpha=0.3)
    ax[1].xaxis.grid(True, alpha=0.3)
    ax[0].legend(bbox_to_anchor=(1, 0.5))
plot_housing()
Different US metropolitan areas clearly follow different housing-price trajectories. I try to explain these differences with PR admissions.
Let's now turn to the first question. Intuitively, more immigration means more demand for housing, which pushes up housing prices. What does the data say?
# Merge the PR and housing datasets
log_housing2 = log_housing.stack().reset_index().rename(columns={0: 'log_price'})
df = (pr.reset_index()
        .merge(log_housing2, left_on=['index', 'year'], right_on=['RegionName', 'year'])
        .set_index('RegionName')
        .drop(columns=['index']))
df['log_total'] = df['Total'].apply(np.log)
df2 = df.drop(index='United States')  # drop the national aggregate
# first plot
sns.lmplot(data=df2, x='log_total', y='log_price', scatter_kws=dict(alpha=0.4))
The fitted line is upward sloping, indicating a positive correlation between log total PR admission and log housing prices.
We can check the coefficient:
lr_model = linear_model.LinearRegression()
lr_model.fit(df2[['log_total']], df2['log_price'])
beta_0 = lr_model.intercept_
beta_1 = lr_model.coef_[0]
print(f"Fitted model: log(price) = {beta_0:.3f} + {beta_1:.3f} log(Total)")
Fitted model: log(price) = 10.606 + 0.195 log(Total)
In this log-log specification the coefficient is an elasticity: the naive regression predicts that a 1% increase in PR admissions is associated with roughly a 0.195% increase in housing prices.
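To make that concrete, the implied price ratio for a 1% increase in admissions follows directly from the fitted elasticity:
# predicted price ratio for a 1% increase in total PR admissions
price_ratio = 1.01 ** beta_1
print(f"{price_ratio - 1:.4%}")  # about 0.19%, not 19.5%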
To investigate the second question, I use a richer linear regression model to estimate the correlation between the number of Chinese immigrants and log housing prices. I include PR admissions from China as their own regressor and group the remaining admissions as 'Other'; I still keep the overall admission variable log_total, and I add region and year dummies as controls.
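Schematically, the design matrix is built like this (a toy sketch with made-up rows; the real construction is in the regress function below):
# toy sketch of the design matrix (hypothetical numbers)
toy = pd.DataFrame({'RegionName': ['Austin, TX', 'Austin, TX', 'Boston, MA'],
                    'year': pd.to_datetime(['2017-12-31', '2018-12-31', '2018-12-31']),
                    'China': [900.0, 950.0, 1200.0],
                    'Total': [9000.0, 9500.0, 12000.0]})
X_toy = toy[['China']].copy()
X_toy['Other'] = toy['Total'] - toy['China']            # all remaining admissions
X_toy['log_total'] = np.log(toy['Total'])
X_toy = X_toy.join(pd.get_dummies(toy['year'], drop_first=True))        # year controls
X_toy = X_toy.join(pd.get_dummies(toy['RegionName'], drop_first=True))  # region controls
X_toy = add_constant(X_toy)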
def scatter_plot(df2, X, *models, x='log_total', y='log_price'):
    '''Plot the data along with each model's predictions.'''
    colors = qeds.themes.COLOR_CYCLE
    def scatter(df, x='log_total', y='log_price', ax=None, figsize=(12, 8)):
        if ax is None:
            _, ax = plt.subplots(figsize=figsize)
        df.plot.scatter(x=x, y=y, alpha=0.4, ax=ax)
        return ax
    def scatter_model(model, X, ax=None, x='log_total', y='log_price', figsize=(12, 8), **args):
        if ax is None:
            _, ax = plt.subplots(figsize=figsize)
        ax.scatter(X[x], model.predict(X), alpha=0.4, **args)
    ax = scatter(df2, x=x, y=y)
    for (i, model) in enumerate(models):
        scatter_model(model, X, ax, color=colors[i + 1], x=x, y=y)
    ax.legend(["Data", "OLS", "Lasso"][:len(models) + 1])  # only label the models actually drawn
def regress(*countries_of_interest, year_controls=True, region_controls=True):
    countries_of_interest = list(countries_of_interest)
    data = df2.reset_index()
    if 'all' in countries_of_interest:
        countries_of_interest = list(
            data.drop(columns=['log_price', 'Total', 'cbsacode', 'year', 'RegionName', 'log_total']))
    # Prepare X variables
    X = data.loc[:, countries_of_interest]
    X['Other'] = data['Total'] - data[countries_of_interest].sum(axis=1)
    X['log_total'] = data['log_total']
    if year_controls:
        year_dummies = pd.get_dummies(data['year'], drop_first=True)
        X = X.join(year_dummies)
    if region_controls:
        region_dummies = pd.get_dummies(data['RegionName'], drop_first=True)
        X = X.join(region_dummies)
    X = add_constant(X)
    # y variable
    y = data['log_price']
    # regressions
    ols = sm.OLS(y, X)
    ols_result = ols.fit()
    lasso_result = ols.fit_regularized(alpha=5e-4, L1_wt=1)
    scatter_plot(data, X, ols_result)  #, lasso_result)
    return ols_result, lasso_result
#a,b = regress('Mexico',"China, People's Republic", "United Kingdom",'India','France')
a,b= regress("China, People's Republic")
#a,b= regress("all")
By inspection, the model fits the data pretty well. In fact, its R-squared is 0.961:
a.summary().tables[0]
Dep. Variable: | log_price | R-squared: | 0.961 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.956 |
Method: | Least Squares | F-statistic: | 211.2 |
Date: | Sat, 18 Apr 2020 | Prob (F-statistic): | 0.00 |
Time: | 16:04:10 | Log-Likelihood: | 666.53 |
No. Observations: | 728 | AIC: | -1179. |
Df Residuals: | 651 | BIC: | -825.6 |
Df Model: | 76 | ||
Covariance Type: | nonrobust |
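The same statistics are also available programmatically as attributes of the results object:
print(f"R-squared: {a.rsquared:.3f}, adjusted: {a.rsquared_adj:.3f}")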
Let's see the coefficients:
a.summary().tables[1]
| coef | std err | t | P>|t| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
const | 9.0173 | 0.369 | 24.426 | 0.000 | 8.292 | 9.742 |
China, People's Republic | 7.675e-06 | 5.71e-06 | 1.345 | 0.179 | -3.53e-06 | 1.89e-05 |
Other | -4.312e-07 | 1.3e-06 | -0.332 | 0.740 | -2.98e-06 | 2.12e-06 |
log_total | 0.3029 | 0.038 | 7.889 | 0.000 | 0.227 | 0.378 |
2005-12-31 00:00:00 | 0.0696 | 0.022 | 3.174 | 0.002 | 0.027 | 0.113 |
2006-12-31 00:00:00 | 0.1231 | 0.023 | 5.341 | 0.000 | 0.078 | 0.168 |
2007-12-31 00:00:00 | 0.1751 | 0.021 | 8.165 | 0.000 | 0.133 | 0.217 |
2008-12-31 00:00:00 | 0.0554 | 0.022 | 2.559 | 0.011 | 0.013 | 0.098 |
2009-12-31 00:00:00 | -0.0743 | 0.022 | -3.418 | 0.001 | -0.117 | -0.032 |
2010-12-31 00:00:00 | -0.0757 | 0.021 | -3.543 | 0.000 | -0.118 | -0.034 |
2011-12-31 00:00:00 | -0.1568 | 0.022 | -7.254 | 0.000 | -0.199 | -0.114 |
2012-12-31 00:00:00 | -0.1608 | 0.022 | -7.466 | 0.000 | -0.203 | -0.119 |
2013-12-31 00:00:00 | -0.0460 | 0.021 | -2.160 | 0.031 | -0.088 | -0.004 |
2014-12-31 00:00:00 | 0.0171 | 0.021 | 0.800 | 0.424 | -0.025 | 0.059 |
2015-12-31 00:00:00 | 0.0508 | 0.022 | 2.346 | 0.019 | 0.008 | 0.093 |
2016-12-31 00:00:00 | 0.0730 | 0.023 | 3.237 | 0.001 | 0.029 | 0.117 |
2017-12-31 00:00:00 | 0.1463 | 0.022 | 6.530 | 0.000 | 0.102 | 0.190 |
2018-12-31 00:00:00 | 0.2171 | 0.022 | 9.785 | 0.000 | 0.174 | 0.261 |
Austin, TX | 0.6732 | 0.057 | 11.771 | 0.000 | 0.561 | 0.785 |
Bakersfield, CA | 0.7093 | 0.084 | 8.399 | 0.000 | 0.543 | 0.875 |
Baltimore, MD | 0.7603 | 0.050 | 15.085 | 0.000 | 0.661 | 0.859 |
Boston, MA | 0.7380 | 0.039 | 19.061 | 0.000 | 0.662 | 0.814 |
Brownsville, TX | 0.0108 | 0.124 | 0.087 | 0.931 | -0.232 | 0.254 |
Buffalo, NY | 0.4417 | 0.126 | 3.506 | 0.000 | 0.194 | 0.689 |
Charlotte, NC | 0.4294 | 0.063 | 6.861 | 0.000 | 0.306 | 0.552 |
Chicago, IL | 0.0642 | 0.044 | 1.466 | 0.143 | -0.022 | 0.150 |
Cincinnati, OH | 0.3819 | 0.077 | 4.929 | 0.000 | 0.230 | 0.534 |
Cleveland, OH | 0.3281 | 0.075 | 4.367 | 0.000 | 0.181 | 0.476 |
Columbus, OH | 0.3106 | 0.061 | 5.132 | 0.000 | 0.192 | 0.429 |
Dallas-Fort Worth, TX | -0.1208 | 0.039 | -3.112 | 0.002 | -0.197 | -0.045 |
Denver, CO | 0.7589 | 0.049 | 15.439 | 0.000 | 0.662 | 0.855 |
Detroit, MI | -0.1144 | 0.041 | -2.773 | 0.006 | -0.195 | -0.033 |
El Paso, TX | 0.1030 | 0.065 | 1.591 | 0.112 | -0.024 | 0.230 |
Fort Myers, FL | 0.8002 | 0.090 | 8.903 | 0.000 | 0.624 | 0.977 |
Fresno, CA | 0.7314 | 0.072 | 10.104 | 0.000 | 0.589 | 0.873 |
Hartford, CT | 0.8312 | 0.069 | 12.015 | 0.000 | 0.695 | 0.967 |
Houston, TX | -0.1999 | 0.041 | -4.893 | 0.000 | -0.280 | -0.120 |
Indianapolis, IN | 0.2635 | 0.069 | 3.841 | 0.000 | 0.129 | 0.398 |
Jacksonville, FL | 0.5727 | 0.072 | 7.966 | 0.000 | 0.432 | 0.714 |
Kansas City, MO | 0.3907 | 0.069 | 5.623 | 0.000 | 0.254 | 0.527 |
Las Vegas, NV | 0.4618 | 0.047 | 9.836 | 0.000 | 0.370 | 0.554 |
Los Angeles-Long Beach-Anaheim, CA | 0.6058 | 0.087 | 7.001 | 0.000 | 0.436 | 0.776 |
Louisville-Jefferson County, KY | 0.3073 | 0.079 | 3.907 | 0.000 | 0.153 | 0.462 |
McAllen, TX | -0.1535 | 0.074 | -2.065 | 0.039 | -0.300 | -0.008 |
Miami-Fort Lauderdale, FL | -0.0743 | 0.077 | -0.967 | 0.334 | -0.225 | 0.077 |
Milwaukee, WI | 0.5530 | 0.081 | 6.791 | 0.000 | 0.393 | 0.713 |
Minneapolis-St Paul, MN | 0.4577 | 0.042 | 10.829 | 0.000 | 0.375 | 0.541 |
Naples, FL | 1.2890 | 0.094 | 13.661 | 0.000 | 1.104 | 1.474 |
Nashville, TN | 0.5698 | 0.066 | 8.615 | 0.000 | 0.440 | 0.700 |
New Haven, CT | 0.9442 | 0.124 | 7.628 | 0.000 | 0.701 | 1.187 |
New York, NY | 0.1580 | 0.174 | 0.910 | 0.363 | -0.183 | 0.499 |
Orlando, FL | 0.2801 | 0.043 | 6.485 | 0.000 | 0.195 | 0.365 |
Philadelphia, PA | 0.2834 | 0.038 | 7.449 | 0.000 | 0.209 | 0.358 |
Phoenix, AZ | 0.3258 | 0.041 | 7.957 | 0.000 | 0.245 | 0.406 |
Pittsburgh, PA | 0.2209 | 0.103 | 2.136 | 0.033 | 0.018 | 0.424 |
Portland, OR | 0.8347 | 0.052 | 15.933 | 0.000 | 0.732 | 0.938 |
Providence, RI | 0.8958 | 0.063 | 14.201 | 0.000 | 0.772 | 1.020 |
Raleigh, NC | 0.7524 | 0.070 | 10.786 | 0.000 | 0.615 | 0.889 |
Riverside, CA | 0.5796 | 0.039 | 14.958 | 0.000 | 0.504 | 0.656 |
Sacramento, CA | 0.8478 | 0.046 | 18.417 | 0.000 | 0.757 | 0.938 |
Salinas, CA | 1.6359 | 0.128 | 12.772 | 0.000 | 1.384 | 1.887 |
Salt Lake City, UT | 0.8850 | 0.071 | 12.486 | 0.000 | 0.746 | 1.024 |
San Antonio, TX | 0.2820 | 0.062 | 4.584 | 0.000 | 0.161 | 0.403 |
San Diego, CA | 1.0311 | 0.038 | 27.413 | 0.000 | 0.957 | 1.105 |
San Francisco, CA | 1.2329 | 0.052 | 23.512 | 0.000 | 1.130 | 1.336 |
San Jose, CA | 1.5138 | 0.039 | 38.767 | 0.000 | 1.437 | 1.591 |
Seattle, WA | 0.7473 | 0.038 | 19.578 | 0.000 | 0.672 | 0.822 |
St. Louis, MO | 0.4985 | 0.090 | 5.538 | 0.000 | 0.322 | 0.675 |
Stamford, CT | 1.3636 | 0.063 | 21.659 | 0.000 | 1.240 | 1.487 |
Stockton, CA | 0.9600 | 0.075 | 12.776 | 0.000 | 0.812 | 1.108 |
Tampa, FL | 0.1914 | 0.045 | 4.277 | 0.000 | 0.104 | 0.279 |
Tucson, AZ | 0.6477 | 0.083 | 7.781 | 0.000 | 0.484 | 0.811 |
Urban Honolulu, HI | 1.6076 | 0.063 | 25.522 | 0.000 | 1.484 | 1.731 |
Ventura, CA | 1.6128 | 0.075 | 21.536 | 0.000 | 1.466 | 1.760 |
Virginia Beach, VA | 0.8906 | 0.094 | 9.437 | 0.000 | 0.705 | 1.076 |
Washington, DC | 0.5938 | 0.045 | 13.263 | 0.000 | 0.506 | 0.682 |
Worcester, MA | 0.9421 | 0.076 | 12.442 | 0.000 | 0.793 | 1.091 |
Notice that the coefficient for China is positive but not significantly different from 0. In terms of correlation with (log) housing prices, a Chinese immigrant is therefore not significantly different from the average immigrant.
Also, the coefficient for log_total is significantly positive, as in our first regression.
Hence, the data suggests that Chinese immigration is associated with higher housing prices, but not in a way that differs from immigration from other countries.
(See Appendix 2 for the Lasso regression.)
Finally, I turn to the bonus question: do past housing prices predict PR admissions? To investigate, I appended to each immigration record the prior 5 years of housing data and used a linear regression model to explain log total PR admission with prior years' housing prices.
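The lags come from shifting the year-indexed housing frame; here is the idea on a toy series (synthetic values; the real construction is in regress_pr_on_housing below):
# toy illustration of building lagged price columns with shift
toy = pd.DataFrame({'Toy Metro': [12.0, 12.1, 12.3]},
                   index=pd.to_datetime(['2016-12-31', '2017-12-31', '2018-12-31']))
pd.concat({f'{i} year prior': toy['Toy Metro'].shift(i) for i in range(2)}, axis=1)
# in the 2018 row, '0 year prior' is 12.3 and '1 year prior' is 12.1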
def regress_pr_on_housing(offset: int, year_controls=False, region_controls=True):
    housing_data = log_housing.T
    PR_data = df2.reset_index().loc[:, ['RegionName', 'year', 'Total', 'log_total']]
    # create lagged price terms; '0 year prior' is the current year
    year_columns = []
    for i in range(0, offset + 1):
        shift = housing_data.shift(i).stack().reset_index().rename(columns={0: f'{i} year prior'})
        PR_data = PR_data.merge(shift, on=['RegionName', 'year'])
        year_columns.append(f'{i} year prior')
    # Prepare X variables
    X = PR_data.loc[:, year_columns]
    if year_controls:
        year_dummies = pd.get_dummies(PR_data['year'], drop_first=True)
        X = X.join(year_dummies)
    if region_controls:
        region_dummies = pd.get_dummies(PR_data['RegionName'], drop_first=True)
        X = X.join(region_dummies)
    X = add_constant(X)
    # y variable
    y = PR_data['log_total']
    # regression
    ols = sm.OLS(y, X)
    ols_result = ols.fit()
    return ols_result
a = regress_pr_on_housing(5)
a.summary().tables[1]
| coef | std err | t | P>|t| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
const | 5.3528 | 0.590 | 9.065 | 0.000 | 4.193 | 6.512 |
0 year prior | 0.2548 | 0.089 | 2.850 | 0.005 | 0.079 | 0.430 |
1 year prior | 0.0799 | 0.176 | 0.453 | 0.651 | -0.267 | 0.426 |
2 year prior | 0.0378 | 0.208 | 0.182 | 0.856 | -0.371 | 0.447 |
3 year prior | 0.0574 | 0.211 | 0.272 | 0.785 | -0.356 | 0.471 |
4 year prior | 0.0244 | 0.180 | 0.136 | 0.892 | -0.329 | 0.378 |
5 year prior | -0.0719 | 0.084 | -0.855 | 0.393 | -0.237 | 0.093 |
Austin, TX | -1.3677 | 0.045 | -30.160 | 0.000 | -1.457 | -1.279 |
Bakersfield, CA | -1.9762 | 0.062 | -31.750 | 0.000 | -2.098 | -1.854 |
Baltimore, MD | -1.1634 | 0.048 | -24.148 | 0.000 | -1.258 | -1.069 |
Boston, MA | -0.1219 | 0.058 | -2.119 | 0.034 | -0.235 | -0.009 |
Brownsville, TX | -1.6753 | 0.127 | -13.202 | 0.000 | -1.924 | -1.426 |
Buffalo, NY | -1.8368 | 0.124 | -14.781 | 0.000 | -2.081 | -1.593 |
Charlotte, NC | -1.4489 | 0.044 | -33.258 | 0.000 | -1.534 | -1.363 |
Chicago, IL | 0.4955 | 0.045 | 10.967 | 0.000 | 0.407 | 0.584 |
Cincinnati, OH | -1.8073 | 0.049 | -36.743 | 0.000 | -1.904 | -1.711 |
Cleveland, OH | -1.7860 | 0.045 | -39.828 | 0.000 | -1.874 | -1.698 |
Columbus, OH | -1.3398 | 0.044 | -30.523 | 0.000 | -1.426 | -1.254 |
Dallas-Fort Worth, TX | 0.2981 | 0.044 | 6.817 | 0.000 | 0.212 | 0.384 |
Denver, CO | -1.1110 | 0.049 | -22.888 | 0.000 | -1.206 | -1.016 |
Detroit, MI | -0.4029 | 0.045 | -8.928 | 0.000 | -0.491 | -0.314 |
El Paso, TX | -1.3904 | 0.047 | -29.330 | 0.000 | -1.483 | -1.297 |
Fort Myers, FL | -2.0547 | 0.068 | -30.207 | 0.000 | -2.188 | -1.921 |
Fresno, CA | -1.8587 | 0.044 | -42.108 | 0.000 | -1.945 | -1.772 |
Hartford, CT | -1.8067 | 0.046 | -39.178 | 0.000 | -1.897 | -1.716 |
Houston, TX | 0.4680 | 0.044 | 10.670 | 0.000 | 0.382 | 0.554 |
Indianapolis, IN | -1.5028 | 0.049 | -30.781 | 0.000 | -1.599 | -1.407 |
Jacksonville, FL | -1.7837 | 0.044 | -40.938 | 0.000 | -1.869 | -1.698 |
Kansas City, MO | -1.6451 | 0.044 | -37.315 | 0.000 | -1.732 | -1.558 |
Las Vegas, NV | -0.9082 | 0.045 | -20.310 | 0.000 | -0.996 | -0.820 |
Los Angeles-Long Beach-Anaheim, CA | 1.0310 | 0.066 | 15.705 | 0.000 | 0.902 | 1.160 |
Louisville-Jefferson County, KY | -1.6634 | 0.058 | -28.564 | 0.000 | -1.778 | -1.549 |
McAllen, TX | -1.6134 | 0.061 | -26.507 | 0.000 | -1.733 | -1.494 |
Miami-Fort Lauderdale, FL | 1.1634 | 0.045 | 25.829 | 0.000 | 1.075 | 1.252 |
Milwaukee, WI | -1.9395 | 0.052 | -37.104 | 0.000 | -2.042 | -1.837 |
Minneapolis-St Paul, MN | -0.6825 | 0.045 | -15.018 | 0.000 | -0.772 | -0.593 |
Naples, FL | -2.2516 | 0.081 | -27.738 | 0.000 | -2.411 | -2.092 |
Nashville, TN | -1.5928 | 0.044 | -35.817 | 0.000 | -1.680 | -1.505 |
New Haven, CT | -1.8936 | 0.124 | -15.221 | 0.000 | -2.138 | -1.649 |
New York, NY | 1.7793 | 0.060 | 29.580 | 0.000 | 1.661 | 1.897 |
Orlando, FL | -0.6609 | 0.044 | -15.137 | 0.000 | -0.747 | -0.575 |
Philadelphia, PA | -0.2284 | 0.045 | -5.099 | 0.000 | -0.316 | -0.140 |
Phoenix, AZ | -0.5509 | 0.044 | -12.473 | 0.000 | -0.638 | -0.464 |
Pittsburgh, PA | -1.8373 | 0.092 | -19.919 | 0.000 | -2.018 | -1.656 |
Portland, OR | -1.2623 | 0.049 | -25.807 | 0.000 | -1.358 | -1.166 |
Providence, RI | -1.6467 | 0.048 | -34.225 | 0.000 | -1.741 | -1.552 |
Raleigh, NC | -1.7909 | 0.045 | -39.881 | 0.000 | -1.879 | -1.703 |
Riverside, CA | -0.4797 | 0.048 | -9.931 | 0.000 | -0.575 | -0.385 |
Sacramento, CA | -1.0180 | 0.052 | -19.712 | 0.000 | -1.119 | -0.917 |
Salinas, CA | -2.4987 | 0.129 | -19.413 | 0.000 | -2.751 | -2.246 |
Salt Lake City, UT | -1.8718 | 0.046 | -40.352 | 0.000 | -1.963 | -1.781 |
San Antonio, TX | -1.3575 | 0.044 | -30.508 | 0.000 | -1.445 | -1.270 |
San Diego, CA | -0.4958 | 0.063 | -7.821 | 0.000 | -0.620 | -0.371 |
San Francisco, CA | -0.0758 | 0.079 | -0.963 | 0.336 | -0.230 | 0.079 |
San Jose, CA | -0.6748 | 0.082 | -8.246 | 0.000 | -0.836 | -0.514 |
Seattle, WA | -0.4292 | 0.054 | -7.940 | 0.000 | -0.535 | -0.323 |
St. Louis, MO | -1.7643 | 0.076 | -23.300 | 0.000 | -1.913 | -1.616 |
Stamford, CT | -1.8226 | 0.062 | -29.289 | 0.000 | -1.945 | -1.700 |
Stockton, CA | -2.0155 | 0.047 | -42.569 | 0.000 | -2.108 | -1.923 |
Tampa, FL | -0.7035 | 0.044 | -16.106 | 0.000 | -0.789 | -0.618 |
Tucson, AZ | -1.8839 | 0.062 | -30.452 | 0.000 | -2.005 | -1.762 |
Urban Honolulu, HI | -1.9148 | 0.068 | -28.049 | 0.000 | -2.049 | -1.781 |
Ventura, CA | -2.2068 | 0.069 | -32.107 | 0.000 | -2.342 | -2.072 |
Virginia Beach, VA | -2.1269 | 0.076 | -27.860 | 0.000 | -2.277 | -1.977 |
Washington, DC | 0.3311 | 0.056 | 5.877 | 0.000 | 0.220 | 0.442 |
Worcester, MA | -2.0258 | 0.048 | -42.561 | 0.000 | -2.119 | -1.932 |
Note that the coefficients on the lagged prices (1 through 5 years prior) are all insignificant; only the current-year price is significant. Thus, the data does not suggest that housing prices in the preceding years are linearly correlated with later immigration. One reason might be that immigration is a lengthy process, so I add more prior years:
regress_pr_on_housing(10).summary().tables[1]
| coef | std err | t | P>|t| | [0.025 | 0.975] |
---|---|---|---|---|---|---|
const | 6.5531 | 0.743 | 8.818 | 0.000 | 5.093 | 8.013 |
0 year prior | 0.2803 | 0.090 | 3.114 | 0.002 | 0.104 | 0.457 |
1 year prior | 0.0170 | 0.164 | 0.104 | 0.917 | -0.305 | 0.339 |
2 year prior | 0.0764 | 0.185 | 0.414 | 0.679 | -0.286 | 0.439 |
3 year prior | -0.0648 | 0.194 | -0.334 | 0.739 | -0.446 | 0.317 |
4 year prior | 0.1592 | 0.199 | 0.799 | 0.424 | -0.232 | 0.551 |
5 year prior | -0.2943 | 0.196 | -1.501 | 0.134 | -0.680 | 0.091 |
6 year prior | 0.5151 | 0.198 | 2.604 | 0.009 | 0.127 | 0.904 |
7 year prior | -0.5846 | 0.197 | -2.960 | 0.003 | -0.973 | -0.197 |
8 year prior | 0.0490 | 0.200 | 0.245 | 0.806 | -0.343 | 0.441 |
9 year prior | 0.2529 | 0.182 | 1.389 | 0.165 | -0.105 | 0.610 |
10 year prior | -0.1210 | 0.088 | -1.382 | 0.168 | -0.293 | 0.051 |
Austin, TX | -1.3508 | 0.042 | -31.990 | 0.000 | -1.434 | -1.268 |
Bakersfield, CA | -2.0367 | 0.058 | -35.066 | 0.000 | -2.151 | -1.923 |
Baltimore, MD | -1.1282 | 0.048 | -23.629 | 0.000 | -1.222 | -1.034 |
Boston, MA | -0.0714 | 0.062 | -1.152 | 0.250 | -0.193 | 0.050 |
Buffalo, NY | -1.9044 | 0.105 | -18.130 | 0.000 | -2.111 | -1.698 |
Charlotte, NC | -1.4279 | 0.039 | -36.677 | 0.000 | -1.504 | -1.351 |
Chicago, IL | 0.4679 | 0.042 | 11.146 | 0.000 | 0.385 | 0.550 |
Cincinnati, OH | -1.8453 | 0.044 | -41.980 | 0.000 | -1.932 | -1.759 |
Cleveland, OH | -1.8536 | 0.041 | -45.052 | 0.000 | -1.934 | -1.773 |
Columbus, OH | -1.3470 | 0.039 | -34.125 | 0.000 | -1.425 | -1.269 |
Dallas-Fort Worth, TX | 0.2575 | 0.039 | 6.556 | 0.000 | 0.180 | 0.335 |
Denver, CO | -1.0898 | 0.048 | -22.891 | 0.000 | -1.183 | -0.996 |
Detroit, MI | -0.4524 | 0.042 | -10.760 | 0.000 | -0.535 | -0.370 |
El Paso, TX | -1.4454 | 0.045 | -32.102 | 0.000 | -1.534 | -1.357 |
Fort Myers, FL | -2.0879 | 0.058 | -35.732 | 0.000 | -2.203 | -1.973 |
Fresno, CA | -1.9046 | 0.040 | -47.476 | 0.000 | -1.983 | -1.826 |
Hartford, CT | -1.8222 | 0.044 | -41.500 | 0.000 | -1.908 | -1.736 |
Houston, TX | 0.4104 | 0.039 | 10.405 | 0.000 | 0.333 | 0.488 |
Indianapolis, IN | -1.5327 | 0.044 | -34.490 | 0.000 | -1.620 | -1.445 |
Jacksonville, FL | -1.8126 | 0.039 | -46.467 | 0.000 | -1.889 | -1.736 |
Kansas City, MO | -1.6779 | 0.040 | -42.178 | 0.000 | -1.756 | -1.600 |
Las Vegas, NV | -0.8673 | 0.041 | -21.252 | 0.000 | -0.948 | -0.787 |
Los Angeles-Long Beach-Anaheim, CA | 1.0689 | 0.074 | 14.407 | 0.000 | 0.923 | 1.215 |
Louisville-Jefferson County, KY | -1.7119 | 0.050 | -34.076 | 0.000 | -1.811 | -1.613 |
McAllen, TX | -1.7529 | 0.066 | -26.376 | 0.000 | -1.883 | -1.622 |
Miami-Fort Lauderdale, FL | 1.1827 | 0.042 | 28.254 | 0.000 | 1.100 | 1.265 |
Milwaukee, WI | -1.9825 | 0.047 | -42.574 | 0.000 | -2.074 | -1.891 |
Minneapolis-St Paul, MN | -0.6750 | 0.042 | -15.906 | 0.000 | -0.758 | -0.592 |
Naples, FL | -2.2398 | 0.073 | -30.610 | 0.000 | -2.384 | -2.096 |
Nashville, TN | -1.6160 | 0.039 | -41.188 | 0.000 | -1.693 | -1.539 |
New Haven, CT | -1.9124 | 0.105 | -18.256 | 0.000 | -2.118 | -1.707 |
New York, NY | 1.8479 | 0.066 | 27.825 | 0.000 | 1.717 | 1.978 |
Orlando, FL | -0.6312 | 0.039 | -16.120 | 0.000 | -0.708 | -0.554 |
Philadelphia, PA | -0.2425 | 0.042 | -5.822 | 0.000 | -0.324 | -0.161 |
Phoenix, AZ | -0.5676 | 0.040 | -14.176 | 0.000 | -0.646 | -0.489 |
Pittsburgh, PA | -2.0116 | 0.105 | -19.097 | 0.000 | -2.219 | -1.805 |
Portland, OR | -1.2819 | 0.049 | -26.314 | 0.000 | -1.378 | -1.186 |
Providence, RI | -1.6302 | 0.047 | -34.472 | 0.000 | -1.723 | -1.537 |
Raleigh, NC | -1.7712 | 0.041 | -42.700 | 0.000 | -1.853 | -1.690 |
Riverside, CA | -0.4749 | 0.047 | -10.024 | 0.000 | -0.568 | -0.382 |
Sacramento, CA | -0.9991 | 0.053 | -18.999 | 0.000 | -1.102 | -0.896 |
Salt Lake City, UT | -1.8495 | 0.044 | -41.871 | 0.000 | -1.936 | -1.763 |
San Antonio, TX | -1.3835 | 0.041 | -34.160 | 0.000 | -1.463 | -1.304 |
San Diego, CA | -0.4365 | 0.070 | -6.194 | 0.000 | -0.575 | -0.298 |
San Francisco, CA | 0.0072 | 0.092 | 0.078 | 0.937 | -0.174 | 0.188 |
San Jose, CA | -0.5956 | 0.096 | -6.174 | 0.000 | -0.785 | -0.406 |
Seattle, WA | -0.3906 | 0.057 | -6.880 | 0.000 | -0.502 | -0.279 |
St. Louis, MO | -1.8030 | 0.064 | -28.219 | 0.000 | -1.928 | -1.677 |
Stamford, CT | -1.7747 | 0.069 | -25.674 | 0.000 | -1.911 | -1.639 |
Stockton, CA | -2.0160 | 0.045 | -44.872 | 0.000 | -2.104 | -1.928 |
Tampa, FL | -0.7210 | 0.039 | -18.446 | 0.000 | -0.798 | -0.644 |
Tucson, AZ | -1.9411 | 0.057 | -33.766 | 0.000 | -2.054 | -1.828 |
Urban Honolulu, HI | -1.8671 | 0.078 | -23.802 | 0.000 | -2.021 | -1.713 |
Ventura, CA | -2.1749 | 0.077 | -28.108 | 0.000 | -2.327 | -2.023 |
Virginia Beach, VA | -2.1460 | 0.065 | -32.796 | 0.000 | -2.275 | -2.017 |
Washington, DC | 0.3876 | 0.061 | 6.399 | 0.000 | 0.269 | 0.507 |
Worcester, MA | -2.0109 | 0.046 | -44.015 | 0.000 | -2.101 | -1.921 |
The coefficient on the 6th prior year is significantly positive and that on the 7th prior year is significantly negative! There might be something going on: perhaps it takes about 7 years for someone who decides to immigrate when housing prices are low to actually be admitted. Then again, with this many lags, a pair of significant coefficients could also be noise.
Appendix 1: maps colored by other countries of birth
graph_pr(2018, "Canada")
graph_pr(2018, "Mexico")
graph_pr(2018, "France")
Appendix 2: Lasso regression
In this appendix, I use Lasso regression to see which coefficients are less important.
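As a reminder of how the L1 penalty behaves, here is a minimal synthetic example (toy data, not this project's variables) in which Lasso shrinks an irrelevant feature's coefficient exactly to zero:
# synthetic demo: only feature 0 matters, so Lasso zeroes feature 1
rng = np.random.RandomState(0)
X_demo = rng.normal(size=(200, 2))
y_demo = 2.0 * X_demo[:, 0] + rng.normal(scale=0.1, size=200)
demo = linear_model.Lasso(alpha=0.1).fit(X_demo, y_demo)
print(demo.coef_)  # second coefficient is (essentially) zero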
def regress2(*countries_of_interest, year_controls=True, region_controls=True):
    '''Same regression as regress(), but with sklearn's OLS and Lasso.'''
    countries_of_interest = list(countries_of_interest)
    data = df2.reset_index()
    # Prepare X variables
    X = data.loc[:, countries_of_interest]
    X['Other'] = data['Total'] - data[countries_of_interest].sum(axis=1)
    X['log_total'] = data['log_total']
    if year_controls:
        year_dummies = pd.get_dummies(data['year'], drop_first=True)
        X = X.join(year_dummies)
    if region_controls:
        region_dummies = pd.get_dummies(data['RegionName'], drop_first=True)
        X = X.join(region_dummies)
    # y variable
    y = data['log_price']
    # regressions
    lr_model = linear_model.LinearRegression()
    lr_model.fit(X, y)
    lasso_model = linear_model.Lasso(tol=1e-3, alpha=1e-3)
    lasso_model.fit(X, y)
    lr_coefs = pd.Series(dict(zip(list(X), lr_model.coef_)))
    lasso_coefs = pd.Series(dict(zip(list(X), lasso_model.coef_)))
    coefs = pd.DataFrame(dict(lasso=lasso_coefs, linreg=lr_coefs))
    print('Linear R2:', lr_model.score(X, y))
    print('Lasso R2:', lasso_model.score(X, y))
    scatter_plot(data, X, lr_model, lasso_model)
    return coefs
coefs = regress2('Mexico', "China, People's Republic", "Korea, South")
Linear R2: 0.9610738048895404
Lasso R2: 0.9291065830902937
The Lasso regression still fits the data quite well. Let's see which coefficients are suppressed by the Lasso regularization:
coefs.query('lasso==0')
| lasso | linreg |
---|---|---|
2014-12-31 00:00:00 | -0.0 | 0.017721 |
Bakersfield, CA | 0.0 | 0.692628 |
Brownsville, TX | -0.0 | -0.006739 |
Buffalo, NY | -0.0 | 0.427049 |
Fort Myers, FL | 0.0 | 0.785983 |
Jacksonville, FL | -0.0 | 0.558148 |
Las Vegas, NV | 0.0 | 0.445833 |
Miami-Fort Lauderdale, FL | 0.0 | -0.118677 |
Nashville, TN | -0.0 | 0.555896 |
New Haven, CT | 0.0 | 0.930427 |
New York, NY | -0.0 | 0.155687 |
Philadelphia, PA | -0.0 | 0.274709 |
St. Louis, MO | -0.0 | 0.484690 |
Tucson, AZ | -0.0 | 0.631371 |
Virginia Beach, VA | 0.0 | 0.876710 |