A common claim is that immigration pushes up housing prices. Moreover, these claims are sometimes targeted at immigrants of a specific origin — for example, that Chinese immigrants cause housing prices to rise.
In this project, I use annual country-specific Lawful Permanent Residents (PR) data for the United States, together with housing data, to find evidence for or against these claims. I aim to answer two questions:
I use linear regression models to estimate correlations between appropriate variables. My findings are:
As a bonus finding from the data, I estimated correlations of PR admission with housing prices in prior years. Interestingly, PR admission is positively correlated with housing prices 6 years earlier and negatively correlated with housing prices 7 years earlier.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import qeds
import geopandas as gpd
from pathlib import Path
# Local data folder holding the housing, PR, and CBSA files used below.
folder = Path(r"C:\Users\YuchengLiu\Documents\323")
pd.__version__  # notebook-style echo of the pandas version in use
import json
from bokeh.plotting import figure
from bokeh.io import output_notebook, show, output_file
from bokeh.models import GeoJSONDataSource, LinearColorMapper, LogColorMapper
from bokeh.models import ColorBar, HoverTool, LogTicker
from bokeh.palettes import brewer
output_notebook()  # route bokeh output into the notebook
import seaborn as sns
import sklearn
from sklearn import linear_model
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant
The housing data is monthly. I converted it to yearly data by averaging.
# Load the metro-level housing file and collapse monthly prices to yearly means.
years = range(2004, 2019)

# import housing file
housing_file = folder / 'housing_by_metro.csv'
raw = pd.read_csv(housing_file)
metro_housing = raw['RegionName']  # metro names, reused later for name matching
housing = (raw.set_index('RegionName')
              .drop(columns=['RegionID', 'SizeRank']))

# take year average: columns are month dates, so transpose, group by year, transpose back
housing.columns = pd.to_datetime(housing.columns)
housing = (housing.T
                  .reset_index()
                  .groupby(pd.Grouper(key='index', freq='Y'))
                  .mean()
                  .T)
housing.columns.name = 'year'
log_housing = np.log(housing)
#housing.head()
def get_metro_map(dfa, metro_housing):
    '''Map metro names in dfa's index onto the metro names used by the housing file.

    Exact matches are kept as-is; otherwise each city/state token of the
    PR-style name ("City1-City2, ST1-ST2") is searched for as a *literal*
    substring of the housing names.

    Parameters
    ----------
    dfa : DataFrame whose index holds PR-style metro names (may include 'Total').
    metro_housing : Series of metro names as spelled in the housing file.

    Returns a dict {pr_name: housing_name}.
    '''
    d = {}
    for metro in dfa.index:
        if metro == 'Total':
            d[metro] = 'United States'  # the PR 'Total' row is the whole U.S.
            continue
        if (metro_housing == metro).any():
            d[metro] = metro  # exact match
            continue
        # Split "City1-City2, ST1-ST2" into city and state tokens.
        end = metro.find(',')
        cities = metro[:end].split('-')
        states = metro[end + 1:].strip().split('-')
        candidates = set()
        for state in states:
            # regex=False: fragments like "St. Louis" contain regex
            # metacharacters; the original regex match could mis-match
            # ('.' matches anything) or raise re.error on '(' etc.
            in_state = metro_housing.str.contains(state, regex=False)
            for city in cities:
                if city != '':
                    hit = in_state & metro_housing.str.contains(city, regex=False)
                    candidates = candidates.union(metro_housing[hit].values)
        if candidates:
            d[metro] = candidates.pop()  # empirically, unique
    # Hand fix: the two sources punctuate this metro differently.
    d['Louisville/Jefferson County, KY-IN'] = 'Louisville-Jefferson County, KY'
    return d
def load_code_file():
    '''Load the CBSA-code lookup table, re-keyed by housing-file metro names.'''
    codes = pd.read_csv(folder / 'cbsa_to_county.csv')
    # Skip the first row, keep one row per metro title, and force integer codes.
    codes = codes[['cbsacode', 'cbsatitle']][1:].drop_duplicates()
    codes['cbsacode'] = codes['cbsacode'].astype(int)
    codes = codes.set_index('cbsatitle')
    # Translate the CBSA titles into the housing file's metro spellings.
    codes.index = codes.index.map(get_metro_map(codes, metro_housing))
    return codes

code_df = load_code_file()
#code_df
The PR data has a file for each year. I cleaned the files individually and appended them together.
# import PR file
def get_year(year, matched_names, common_names, country_level=False):
    '''Load and process the PR spreadsheet for a single year.

    Parameters
    ----------
    year : int
        Year whose file (filename starts with the year) is loaded.
    matched_names, common_names : set
        Running accumulators across years: all housing names matched so far,
        and the housing names matched in *every* year so far.
    country_level : bool
        If True take the per-country rows (row 10 onward), otherwise the
        regional summary rows 1-8.

    Returns (dfa, matched_names, common_names).
    '''
    pr_folder_name = folder / 'pr_tables'
    # Find this year's spreadsheet.  The original loop silently kept the
    # *last* file when nothing matched, processing the wrong data; fail loudly.
    for file in pr_folder_name.iterdir():
        if file.name.startswith(str(year)):
            break
    else:
        raise FileNotFoundError(f"no PR file for {year} in {pr_folder_name}")
    # extract the data: the header row moved between spreadsheet vintages
    header = 3 if year > 2012 else 4
    df = pd.read_excel(file, header=header)
    dfa = df.loc[10:] if country_level else df.loc[1:8]
    # remove footnotes: everything from the 'Unknown' row down is footnote text
    dfa = dfa.set_index('Region and country of birth')
    last = dfa.index.get_loc('Unknown')
    dfa = dfa[:last].T
    # match metro names with housing data
    mapping = get_metro_map(dfa, metro_housing)
    matched_names = matched_names.union(mapping.values())
    common_names = common_names.intersection(mapping.values())
    dfa.index = dfa.index.map(mapping)
    dfa = dfa[dfa.index.notnull()]
    # 'D' (data withheld) and '-' are treated as zero admissions
    dfa = dfa.replace({'D': 0, '-': 0}).astype(float)
    # merge with cbsa code
    dfa = (dfa.reset_index()
              .merge(code_df.reset_index(), left_on='index', right_on='cbsatitle', how='left')
              .set_index('index')
              .drop('cbsatitle', axis=1))
    dfa.loc['United States', 'cbsacode'] = 0  # put 0 for U.S.
    # add year
    dfa['year'] = pd.to_datetime(f"{year}-12-31")
    # fix some names that changed across spreadsheet vintages
    dfa = dfa.rename(columns={'Korea': 'Korea, South', 'Antigua-Barbuda': 'Antigua and Barbuda'})
    return dfa, matched_names, common_names
# Append all the years together.  Collect per-year frames in a list and
# concatenate once: DataFrame.append was deprecated and removed in pandas 2.x,
# and a single concat is O(n) instead of O(n^2) re-copying.
frames = []
matched_names = set()
common_names = set(metro_housing)
for year in range(2004, 2019):
    print('-', end='')  # cheap progress indicator, one dash per year
    dta, matched_names, common_names = get_year(year, matched_names, common_names, True)
    frames.append(dta)
pr = pd.concat(frames)
pr = pr.dropna(axis=1)  # keep only columns present in every year's sheet
print('Done loading PR files!')
These state and CBSA dataframes are used for mapping.
def load_state_df():
    '''Download state boundaries; cbsacode 0 marks the national (U.S.-total) layer.

    NOTE(review): the original called df.set_index('STATEFP') without
    assigning the result — a no-op.  It is removed here; the caller
    (graph_pr) drops rows by positional RangeIndex labels, so the frame
    must keep its default index anyway.
    '''
    df = gpd.read_file('https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_5m.zip')
    df['cbsacode'] = 0.0  # sentinel matching the U.S.-total PR row
    return df[['NAME', 'cbsacode', 'geometry']]

state_df = load_state_df()
#state_df
def load_cbsa_df():
    '''Download CBSA (metro-area) boundaries keyed by numeric cbsacode.'''
    shapes = gpd.read_file("https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_cbsa_5m.zip")
    # Float code so it merges cleanly with the PR data's cbsacode column.
    shapes['cbsacode'] = shapes['CBSAFP'].astype(float)
    return shapes[['cbsacode', 'geometry']]

cbsa_df = load_cbsa_df()
#cbsa_df
First, I present a visualization of the Permanent Resident data.
def graph_pr_bokeh(df1, df2, year, paint_according_to='Total'):
    '''Draw an interactive bokeh choropleth of PR admissions.

    df1: background layer (states; white fill, excluded from hover).
    df2: metro (CBSA) layer, filled by the `paint_according_to` column
         and carrying the hover tooltips.
    '''
    #paint_according_to='Japan'
    geojson1 = GeoJSONDataSource(geojson=df1.to_json())
    geojson2 = GeoJSONDataSource(geojson=df2.to_json())
    # High-volume origins span orders of magnitude, so use a log color
    # scale for them and a linear scale otherwise.
    if paint_according_to in {'Total', 'China', 'Mexico', 'Russia'}:
        color_mapper = LogColorMapper(palette=brewer['YlOrRd'][9][::-1])
    else:
        color_mapper = LinearColorMapper(palette=brewer['YlOrRd'][9][::-1])
    # NOTE(review): the color bar always uses a LogTicker even when the
    # mapper is linear, and is currently never added to the plot (the
    # add_layout call below is commented out).
    color_bar = ColorBar(color_mapper=color_mapper, ticker=LogTicker(),
                         label_standoff=12, height=20,
                         border_line_color=None, location=(0, 0), orientation='horizontal')
    title = f"Persons Obtaining PR Status by Region and Country of Birth in {year}"
    if paint_according_to != 'Total':
        title += f', colored by persons from {paint_according_to}'
    p = figure(title=title, plot_width=750, plot_height=500, tools="pan,box_zoom,reset,previewsave")
    # Background states: plain white, no hover.
    hover_off = p.patches("xs", "ys", source=geojson1, fill_color='white', line_color='grey')
    # Metro polygons: fill driven by the selected origin column.
    hover_on = p.patches("xs", "ys", source=geojson2, line_color='black',
                         fill_color={'field': paint_according_to, 'transform': color_mapper})
    tooltips = [('Region', '@index'), ('Total', '@Total'), ('Canada', '@Canada'),
                ('China', '@China'), ('Mexico', '@Mexico'), ]
    # Ensure the painted column always appears in the tooltip list.
    temp = (paint_according_to, '@' + paint_according_to)
    if temp not in tooltips:
        tooltips.append(temp)
    hover = HoverTool(tooltips=tooltips, renderers=[hover_on])
    p.add_tools(hover)
    #p.add_layout(color_bar, 'below')
    show(p)
def graph_pr(year=2018, paint_according_to='Total', bokeh=True):
    '''Plot PR admissions by metro for one year.

    year: admission year to display (`pr['year']` holds Dec-31 timestamps).
    paint_according_to: origin-country column used for the fill color.
    bokeh: interactive bokeh map if True, else a static matplotlib plot.
    '''
    year2 = f"{year}-12-31"  # match the year-end timestamps stored in pr
    # Honolulu is dropped — presumably because it lies off the mainland
    # map extent; TODO confirm.
    data = (pr[pr['year'] == year2]
            .drop(index=['Urban Honolulu, HI'])
            .reset_index()
            .drop(columns='year')
            .rename(columns={"China, People's Republic": 'China'}))
    # First draw US, then draw the metros
    # Positional RangeIndex labels dropped from state_df below:
    '''{
    27:Hawaii,
    39:US Virgin Islands,
    47:Guam,
    48:Commonwealth of the Northern Mariana Islands,
    49:American Samoa,
    51:Puerto Rico,
    55:Alaska}'''
    df1 = state_df.drop(index=[27, 39, 47, 48, 49, 51, 55]).merge(data, on='cbsacode', how='left')
    df2 = cbsa_df.merge(data, on='cbsacode', how='inner')
    if bokeh:
        graph_pr_bokeh(df1, df2, year, paint_according_to)
    else:
        fig, gax = plt.subplots(figsize=(15, 10))
        df1.plot(ax=gax, alpha=0.3)  # faded state background
        df2.plot(ax=gax, edgecolor="black", color="white")  # metro outlines
    #return df1,df2

graph_pr(2018)
graph_pr(2018, 'China')