Data Exploration: Migration in Europe

In [1]:
import numpy as np
import pandas as pd
from functools import reduce
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.patches as pat
from matplotlib import cm

Immigration & emigration overview

In [2]:
# TOTAL IMMIGRATION DATA
# https://ec.europa.eu/eurostat/cache/metadata/en/migr_immi_esms.htm
df_imm_total = pd.read_excel('migr_imm2ctz.xlsx', sheet_name='total')
df_imm_total
Out[2]:
GEO/TIME 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
0 Belgium : : 135281 147377 129477 120078 123158 146626 123702 126703
1 Bulgaria : : : : 14103 18570 26615 25223 21241 25597
2 Czechia 108267 75620 48317 27114 34337 30124 29897 29602 64083 51847
3 Denmark 57357 51800 52236 52833 54409 60312 68388 78492 74383 68579
4 Germany (until 1990 former territory of the FRG) 682146 346216 404055 489422 592175 692713 884893 1543848 1029852 917109
5 Estonia 3671 3884 2810 3709 2639 4109 3904 15413 14822 17616
6 Ireland 82592 50604 52339 57292 61324 65539 73519 80792 85185 78499
7 Greece 66529 58613 60462 60089 58200 57946 59013 64446 116867 112247
8 Spain 599075 392962 360705 371331 304053 280772 305454 342114 414746 532132
9 France 296608 296970 307111 319816 327431 338752 340383 364221 378115 369964
10 Croatia 16883 13213 8846 8534 8959 10378 10638 11706 13985 15553
11 Italy 534712 442940 458856 385793 350772 307454 277631 280078 300823 343440
12 Cyprus 21060 22581 20206 23037 17476 13149 9212 15183 17391 21306
13 Latvia 4678 3731 4011 10234 13303 8299 10365 9479 8345 9916
14 Lithuania 9297 6487 5213 15685 19843 22011 24294 22130 20162 20368
15 Luxembourg 17758 15751 16962 20268 20478 21098 22332 23803 22888 24379
16 Hungary 37652 27894 25519 28018 33702 38968 54581 58344 53618 68070
17 Malta 6043 6161 4275 5465 8256 10897 14454 16936 17051 21676
18 Netherlands 143516 122917 126776 130118 124566 129428 145323 166872 189232 189646
19 Austria 73772 69295 70978 82230 91557 101866 116262 166323 129509 111801
20 Poland 15275 189166 155131 157059 217546 220311 222275 218147 208302 209353
21 Portugal 29718 32307 27575 19667 14606 17554 19516 29896 29925 36639
22 Romania 138929 135844 149885 147685 167266 153646 136035 132795 137455 177435
23 Slovenia 30693 30296 15416 14083 15022 13871 13846 15420 16623 18808
24 Slovakia 8765 6346 5272 4829 5419 5149 5357 6997 7686 7188
25 Finland 29114 26699 25636 29481 31278 31941 31507 28746 34905 31797
26 Sweden 101171 102280 98801 96467 103059 115845 126966 134240 163005 144489
27 United Kingdom 590242 566514 590950 566044 498040 526046 631991 631452 588993 644209
28 Iceland 10288 3921 3948 4073 4960 6406 5368 5635 8710 12116
29 Liechtenstein 578 584 591 650 671 696 615 657 607 645
30 Norway 58123 55953 69214 70337 69908 68313 66903 60816 61460 53351
31 Switzerland 184297 160623 161778 148799 149051 160157 156282 153627 149305 143377
32 North Macedonia : : : : : : : : : :
33 Turkey : : : : : : : : : :
34 Belarus : : : : : : : : : :
35 Bosnia and Herzegovina : : : : : : : : : :
36 Moldova : : : : : : : : : :
37 Russia : : : : : : : : : :
38 San Marino : : : : : : : : : :
39 United States : : : : : : : : : :
40 Kyrgyzstan : : : : : : : : : :
41 Tajikistan : : : : : : : : : :
42 Uzbekistan : : : : : : : : : :
43 Armenia : : : : : : : : : :
44 Azerbaijan : : : : : : : : : :
In [3]:
df_imm_total.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 11 columns):
GEO/TIME    45 non-null object
2008        45 non-null object
2009        45 non-null object
2010        45 non-null object
2011        45 non-null object
2012        45 non-null object
2013        45 non-null object
2014        45 non-null object
2015        45 non-null object
2016        45 non-null object
2017        45 non-null object
dtypes: object(11)
memory usage: 4.0+ KB
In [4]:
df_imm_total.keys()[0]
Out[4]:
'GEO/TIME'
In [5]:
# Set index
df_imm_total.set_index('GEO/TIME', inplace=True)

# Replace colons with missing value NaNs and set datatype
df_imm_total = df_imm_total.apply(pd.to_numeric, errors='coerce')

# Drop rows with no data
df_imm_total.dropna(how='all', inplace=True)

# Explore remaining missing values
df_imm_total[df_imm_total.isnull().sum(axis=1) > 0]
Out[5]:
2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
GEO/TIME
Belgium NaN NaN 135281.0 147377.0 129477.0 120078.0 123158.0 146626.0 123702.0 126703.0
Bulgaria NaN NaN NaN NaN 14103.0 18570.0 26615.0 25223.0 21241.0 25597.0
In [6]:
# Backfill missing values
df_imm_total.bfill(axis=1, inplace=True)

# Rename Germany
df_imm_total.index = [s.replace('Germany (until 1990 former territory of the FRG)', 'Germany') for s in df_imm_total.index]

# Strip column names
df_imm_total.columns = [col.strip() for col in df_imm_total.columns]

df_imm_total.head()
Out[6]:
2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
Belgium 135281.0 135281.0 135281.0 147377.0 129477.0 120078.0 123158.0 146626.0 123702.0 126703.0
Bulgaria 14103.0 14103.0 14103.0 14103.0 14103.0 18570.0 26615.0 25223.0 21241.0 25597.0
Czechia 108267.0 75620.0 48317.0 27114.0 34337.0 30124.0 29897.0 29602.0 64083.0 51847.0
Denmark 57357.0 51800.0 52236.0 52833.0 54409.0 60312.0 68388.0 78492.0 74383.0 68579.0
Germany 682146.0 346216.0 404055.0 489422.0 592175.0 692713.0 884893.0 1543848.0 1029852.0 917109.0
In [7]:
# Eurostat dataset cleaning function
countries = df_imm_total.index.tolist()

def clean_eurostat_excel(file_name, sheet):
    data = pd.read_excel(file_name, sheet_name = sheet, index_col=0)

    # rename Germany
    data.index = [str(s).replace('Germany (until 1990 former territory of the FRG)', 'Germany') for s in data.index]

    # replace ':' with NaNs, set datatype, and impute missing data with backfill/frontfill
    data = data.apply(pd.to_numeric, errors='coerce').bfill(axis=1).ffill(axis=1)
    
    # drop rows with no data
    data.dropna(how='all', inplace=True)

    # strip column names of extraneous spaces
    data.columns = [col.strip() for col in data.columns]
    
    # reduce rows to country list
    data = data.loc[countries]

    return data;
In [8]:
df_imm_total = clean_eurostat_excel('migr_imm2ctz.xlsx', 'total')

df_imm_total.head()
Out[8]:
2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
Belgium 135281.0 135281.0 135281.0 147377.0 129477.0 120078.0 123158.0 146626.0 123702.0 126703.0
Bulgaria 14103.0 14103.0 14103.0 14103.0 14103.0 18570.0 26615.0 25223.0 21241.0 25597.0
Czechia 108267.0 75620.0 48317.0 27114.0 34337.0 30124.0 29897.0 29602.0 64083.0 51847.0
Denmark 57357.0 51800.0 52236.0 52833.0 54409.0 60312.0 68388.0 78492.0 74383.0 68579.0
Germany 682146.0 346216.0 404055.0 489422.0 592175.0 692713.0 884893.0 1543848.0 1029852.0 917109.0
In [9]:
# TOTAL EMIGRATION DATA
# https://ec.europa.eu/eurostat/cache/metadata/en/migr_immi_esms.htm
df_emi_total = clean_eurostat_excel('migr_emi1ctz.xlsx', 'total')

df_emi_total.head()
Out[9]:
2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
Belgium 66013.0 66013.0 66013.0 84148.0 93600.0 102657.0 94573.0 89794.0 92471.0 89690.0
Bulgaria 16615.0 16615.0 16615.0 16615.0 16615.0 19678.0 28727.0 29470.0 30570.0 31586.0
Czechia 51478.0 61782.0 61069.0 55910.0 46106.0 25894.0 28468.0 25684.0 38864.0 27316.0
Denmark 38356.0 39899.0 41456.0 41593.0 43663.0 43310.0 44426.0 44625.0 52654.0 56403.0
Germany 737889.0 286582.0 252456.0 249045.0 240001.0 259328.0 324221.0 347162.0 533762.0 560700.0
In [10]:
# Pivot transformation to long-form for visual analysis
def df_to_longform(df, data_col_name):
    
    df = pd.melt(df.reset_index(), id_vars='index')

    df.columns=('country','year', data_col_name)

    df.sort_values(by=['country','year'], inplace=True)

    df['year'] = df['year'].astype(str)

    return df;
In [11]:
dfg_imm_total = df_to_longform(df_imm_total, 'immigrants')

dfg_imm_total.head()
Out[11]:
country year immigrants
19 Austria 2008 73772.0
51 Austria 2009 69295.0
83 Austria 2010 70978.0
115 Austria 2011 82230.0
147 Austria 2012 91557.0
In [12]:
dfg_imm_total.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 320 entries, 19 to 315
Data columns (total 3 columns):
country       320 non-null object
year          320 non-null object
immigrants    320 non-null float64
dtypes: float64(1), object(2)
memory usage: 10.0+ KB
In [13]:
# Dataframe to long-form for graphing
dfg_emi_total = df_to_longform(df_emi_total, 'emigrants')

dfg_emi_total.head()
Out[13]:
country year emigrants
19 Austria 2008 51563.0
51 Austria 2009 53244.0
83 Austria 2010 51651.0
115 Austria 2011 51197.0
147 Austria 2012 51812.0
In [14]:
# Combining long-form dataframes to graph
dfg = pd.merge(dfg_imm_total, dfg_emi_total, on=['country', 'year'])

dfg.head()
Out[14]:
country year immigrants emigrants
0 Austria 2008 73772.0 51563.0
1 Austria 2009 69295.0 53244.0
2 Austria 2010 70978.0 51651.0
3 Austria 2011 82230.0 51197.0
4 Austria 2012 91557.0 51812.0
In [15]:
# Style
sns.set_style('whitegrid')

# Pairplot by country
pal = sns.cubehelix_palette(10, start=0.3, rot=-0.8)
g = sns.pairplot(dfg, vars=dfg.iloc[:,2:4],
            height=5, palette=pal, kind='reg', hue='country')

# Title
plt.subplots_adjust(top=0.85)
g.fig.suptitle('Immigration & Emigration by Country', size=16, weight='demi');
In [16]:
# Pairplot by years
pal = sns.cubehelix_palette(10, start=0.3, rot=-0.8)
g = sns.pairplot(dfg, vars=dfg.iloc[:,2:4],
            height=5, palette=pal, kind='reg', hue='year')

# Title
plt.subplots_adjust(top=0.85)
g.fig.suptitle('Immigration & Emigration by Year', size=16, weight='demi');
In [17]:
# Initialize FacetGrid object
g = sns.FacetGrid(dfg_imm_total, col='country', col_wrap=4, 
                  height=2,aspect=2).set(xticks=np.arange(0,10,3))

# Create plot
g.map(plt.plot, 'year', 'immigrants',  color='darkorange')
g.map(plt.fill_between, 'year', 'immigrants',  color='darkorange', alpha=0.5)
g.map(plt.axhline, y=0, lw=2, color='darkorange')

# Facet titles
for ax in g.axes:
    g.set_titles("{col_name}")

# Formatting axes
g.set(yticks=[])
g.despine(bottom=True, left=True)

# Legend
color_key = {'Immigrants': 'darkorange'}
patches =  [pat.Patch(color=v, label=k) for k,v in color_key.items()]

g.fig.legend(handles=patches, bbox_to_anchor =
             [0.49, 0.93]).get_frame().set_edgecolor('1.0')

# Title
plt.subplots_adjust(top=0.85)
g.fig.suptitle('Total Immigration', size=16, weight='demi');
In [18]:
# FIRST GENERATION IMMIGRANTS REASONS DATA 2014
# https://ec.europa.eu/eurostat/cache/metadata/en/lfso_14_esms.htm
df_imm_fgen_bythous = clean_eurostat_excel('lfso_14b1dr.xlsx', 'Reason first gen imm 2014')

df_imm_fgen_bythous[df_imm_fgen_bythous.isnull().sum(axis=1) > 0]
C:\Users\Hillary\Anaconda3\lib\site-packages\ipykernel_launcher.py:20: FutureWarning: 
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
Out[18]:
Total Family reasons Education reasons Work, job found before migrating Work, no job found before migrating International protection or asylum Other No response
Denmark NaN NaN NaN NaN NaN NaN NaN NaN
Ireland NaN NaN NaN NaN NaN NaN NaN NaN
Netherlands NaN NaN NaN NaN NaN NaN NaN NaN
Iceland NaN NaN NaN NaN NaN NaN NaN NaN
Liechtenstein NaN NaN NaN NaN NaN NaN NaN NaN
In [19]:
# Drop countries with no data
df_imm_fgen_bythous.dropna(how='all', inplace=True)

# Order dataframe by total first generation immigrants
df_imm_fgen_bythous.sort_values(by=['Total'], inplace=True, ascending=False)

fgen_order = df_imm_fgen_bythous.index

# Drop total - only graphing reasons
df_imm_fgen_bythous.drop('Total', axis=1, inplace=True)

# Dataframe to long-form for graphing
dfg_imm_fgen_bythous = df_to_longform(df_imm_fgen_bythous, 'x')

dfg_imm_fgen_bythous.columns = ['country', 'Reason', 'thousands']
In [20]:
# Set plots and style
sns.set(rc={'figure.figsize':(15,30)}, style='whitegrid')

# Plot
ax = sns.barplot(data=dfg_imm_fgen_bythous, x='thousands', y='country', hue='Reason',
                 palette=sns.color_palette('deep', 7), order=fgen_order, 
                 hue_order=['Family reasons', 'Work, no job found before migrating', 
                      'Work, job found before migrating', 
                       'International protection or asylum', 'Education reasons', 
                       'Other', 'No response']) #

# Legend
plt.legend(loc='center right')

# Title
ax.set_title('First Generation Immigrants: Reasons for Migration (2014)', fontsize='large', fontweight='demi', y=1.02)
plt.figtext(0.5,0.893, 'Countries in descending order of total immigrants in 2014', ha="center", va="top", fontsize=12, color='grey'); 
In [21]:
# Create boolean criteria for subplots
df_imm_fgen_bythous['Fam500k+'] = df_imm_fgen_bythous['Family reasons'] > 500
df_imm_fgen_bythous['Fam100k+'] = df_imm_fgen_bythous['Family reasons'] > 100

df_imm_fgen_bythous.head()
Out[21]:
Family reasons Education reasons Work, job found before migrating Work, no job found before migrating International protection or asylum Other No response Fam500k+ Fam100k+
Germany 3661.4 396.4 676.5 1142.0 733.0 1238.5 530.8 True True
United Kingdom 3154.4 974.4 827.7 1282.0 318.2 338.3 32.1 True True
Italy 2375.3 114.0 376.4 2039.5 23.6 65.8 7.6 True True
France 3301.5 473.8 232.2 500.1 219.4 219.5 219.5 True True
Spain 2068.3 119.7 534.8 1552.1 24.3 360.7 76.7 True True
In [22]:
# Creating sub-dataframes for zoomed-in subplots
df_imm_fgen_bythous_lar = df_imm_fgen_bythous.loc[df_imm_fgen_bythous['Fam500k+'] == True].drop(
    ['Fam500k+', 'Fam100k+'], axis=1)

df_imm_fgen_bythous_med = df_imm_fgen_bythous.loc[(df_imm_fgen_bythous['Fam500k+'] == False) & (
    df_imm_fgen_bythous['Fam100k+'] == True)].drop(
    ['Fam500k+', 'Fam100k+'], axis=1)

df_imm_fgen_bythous_sma = df_imm_fgen_bythous.loc[df_imm_fgen_bythous['Fam100k+'] == False].drop(
    ['Fam500k+', 'Fam100k+'], axis=1)

df_imm_fgen_bythous_lar.head()
Out[22]:
Family reasons Education reasons Work, job found before migrating Work, no job found before migrating International protection or asylum Other No response
Germany 3661.4 396.4 676.5 1142.0 733.0 1238.5 530.8
United Kingdom 3154.4 974.4 827.7 1282.0 318.2 338.3 32.1
Italy 2375.3 114.0 376.4 2039.5 23.6 65.8 7.6
France 3301.5 473.8 232.2 500.1 219.4 219.5 219.5
Spain 2068.3 119.7 534.8 1552.1 24.3 360.7 76.7
In [23]:
# Data to long-form for graphing
dfg_imm_fgen_bythous_lar = df_to_longform(df_imm_fgen_bythous_lar, 'x')
dfg_imm_fgen_bythous_med = df_to_longform(df_imm_fgen_bythous_med, 'x')
dfg_imm_fgen_bythous_sma = df_to_longform(df_imm_fgen_bythous_sma, 'x')

dfg_imm_fgen_bythous_lar.columns = ['country', 'Reason', 'thousands']
dfg_imm_fgen_bythous_med.columns = ['country', 'Reason', 'thousands']
dfg_imm_fgen_bythous_sma.columns = ['country', 'Reason', 'thousands']

dfg_imm_fgen_bythous_lar.head()
Out[23]:
country Reason thousands
17 Austria Education reasons 77.2
8 Austria Family reasons 608.9
44 Austria International protection or asylum 109.7
62 Austria No response 39.0
53 Austria Other 39.0
In [24]:
# Sort for graphing order
dfg_imm_fgen_bythous_lar.sort_values(by=['thousands'], ascending=False, inplace=True)

dfg_imm_fgen_bythous_med.sort_values(by=['thousands'], ascending=False, inplace=True)

dfg_imm_fgen_bythous_sma.sort_values(by=['thousands'], ascending=False, inplace=True)
In [25]:
# Set plots and style
fig, axs = plt.subplots(3, figsize=(10, 20), gridspec_kw={'height_ratios': [3, 2, 4]})

# Set style
sns.set_style('whitegrid')

# First subplot (> 500k family reasons)
p0 = sns.barplot(data=dfg_imm_fgen_bythous_lar, x='thousands', y='country', hue='Reason',
                 palette=sns.color_palette('deep', 7),
                 hue_order=['Family reasons', 'Work, no job found before migrating', 
                      'Work, job found before migrating', 
                       'International protection or asylum', 'Education reasons', 
                       'Other', 'No response'], ax=axs[0])
# Second subplot (200k > 500k family reasons)
p1 = sns.barplot(data=dfg_imm_fgen_bythous_med, x='thousands', y='country', hue='Reason',
                palette=sns.color_palette('deep', 7),
                 hue_order=['Family reasons', 'Work, no job found before migrating', 
                      'Work, job found before migrating', 
                       'International protection or asylum', 'Education reasons', 
                       'Other', 'No response'], ax=axs[1])

# Third subplot (< 200k family reasons)
p2 = sns.barplot(data=dfg_imm_fgen_bythous_sma, x='thousands', y='country', hue='Reason',
                 palette=sns.color_palette('deep', 7),
                 hue_order=['Family reasons', 'Work, no job found before migrating', 
                      'Work, job found before migrating', 
                       'International protection or asylum', 'Education reasons', 
                       'Other', 'No response'], ax=axs[2])


# Formatting
fig.tight_layout()
axs[0].set(xlabel='', ylabel='')
axs[1].set(xlabel='', ylabel='')
axs[2].set(xlabel='First generation immigrants', ylabel='')

# Legend
handles, labels = axs[2].get_legend_handles_labels()
p0.legend(loc='lower right')
p1.legend(handles[:0], labels[:0])
p2.legend(handles[:0], labels[:0])

# Title
plt.subplots_adjust(top=0.98)
fig.suptitle('First Generation Immigrants: Reasons for Migration (2014)', fontsize='large', fontweight='demi', y=1.01); 
In [26]:
dfi_imm_fgen_bythous = df_imm_fgen_bythous.reset_index()

dfi_imm_fgen_bythous.head()
Out[26]:
index Family reasons Education reasons Work, job found before migrating Work, no job found before migrating International protection or asylum Other No response Fam500k+ Fam100k+
0 Germany 3661.4 396.4 676.5 1142.0 733.0 1238.5 530.8 True True
1 United Kingdom 3154.4 974.4 827.7 1282.0 318.2 338.3 32.1 True True
2 Italy 2375.3 114.0 376.4 2039.5 23.6 65.8 7.6 True True
3 France 3301.5 473.8 232.2 500.1 219.4 219.5 219.5 True True
4 Spain 2068.3 119.7 534.8 1552.1 24.3 360.7 76.7 True True
In [27]:
# Set up PairGrid
g = sns.PairGrid(data=dfi_imm_fgen_bythous,
                x_vars=dfi_imm_fgen_bythous.columns[1:6], y_vars=['index'],
                height=10, aspect=0.3)

# Create stripplot
g.map(sns.stripplot, size=10, orient='h', palette=sns.cubehelix_palette(32, start=0.5, rot=-0.8, reverse=True),
     linewidth=1, edgecolor='w')

# Set x-axis limits on all columns
g.set(xlim=(-150, 4000), xlabel="Immigrants in thousands", ylabel="")

# Column titles
titles = ['Family', 'Education', 'Work (job prev.)', 'Work (no job prev.)', 'Asylum']

for ax, title in zip(g.axes.flat, titles):

    # Set a different title for each axes
    ax.set(title=title)

    # Make the grid horizontal instead of vertical
    ax.xaxis.grid(False)
    ax.yaxis.grid(True)

# Title
plt.subplots_adjust(top=0.85)
g.fig.suptitle('First Generation Immigrants: Reasons for Migration (2014)', size=16, weight='demi')
plt.figtext(0.5,0.95, 'Countries in descending order of total immigrants in 2014', ha="center", va="top", fontsize=14, color='grey')

sns.despine(left=True, bottom=True);
In [28]:
# Initialize FacetGrid object
g = sns.FacetGrid(dfg, col='country', col_wrap=4, 
                  height=2,aspect=2).set(xticks=np.arange(0,10,3))


# Create immigration plot - set for immigrants
g.map(plt.plot, 'year', 'immigrants', color='darkorange')
g.map(plt.fill_between, 'year', 'immigrants',  color='darkorange', alpha=0.5)

g.map(plt.plot, 'year', 'emigrants', color='darkgrey')
g.map(plt.fill_between, 'year', 'emigrants',  color='darkgrey', alpha=0.5)

# Facet titles
for ax in g.axes:
    g.set_titles("{col_name}")
    g.set_axis_labels(y_var= '')

# Formatting axes
g.set(yticks=[])
g.despine(left=True)

# Legend
color_key = {'Immigrants': 'darkorange',
            'Emigrants' : 'darkgrey'}

patches =  [pat.Patch(color=v, label=k) for k,v in color_key.items()]

g.fig.legend(handles=patches, bbox_to_anchor =
             [0.525, 0.93], ncol=2).get_frame().set_edgecolor('1.0')

# Title
plt.subplots_adjust(top=0.85)
g.fig.suptitle('Total Immigration & Emigration', size=16, weight='demi');
In [29]:
# REPORTING COUNTRY EMIGRATION DATA 
# https://ec.europa.eu/eurostat/cache/metadata/en/migr_immi_esms.htm
df_emi_reporting = clean_eurostat_excel('migr_emi1ctz.xlsx', 'reporting_country')

# Pivot transformation to long-form for visual analysis
dfg_emi_reporting = df_to_longform(df_emi_reporting, 'reporting_country_emi')

# Merge for graphing
dfg = pd.merge(dfg, dfg_emi_reporting, on=['country', 'year'])

dfg.isnull().sum()
Out[29]:
country                  0
year                     0
immigrants               0
emigrants                0
reporting_country_emi    0
dtype: int64
In [30]:
# Initialize FacetGrid object
g = sns.FacetGrid(dfg, col='country', col_wrap=4, 
                  height=2,aspect=2).set(xticks=np.arange(0,10,3))

# Create immigration plot - set for immigrants
g.map(plt.plot, 'year', 'emigrants', color='darkgrey')
g.map(plt.fill_between, 'year', 'emigrants',  color='darkgrey', alpha=0.5)

g.map(plt.plot, 'year', 'reporting_country_emi', color='cadetblue')
g.map(plt.fill_between, 'year', 'reporting_country_emi',  color='cadetblue', alpha=0.5)

# Facet titles
for ax in g.axes:
    g.set_titles("{col_name}")
    g.set_axis_labels(y_var= '')

# Formatting axes
g.set(yticks=[])
g.despine(left=True)

# Legend
color_key = {'Total emigrants': 'darkgrey',
            'Reporting country emigrants' : 'cadetblue'}

patches =  [pat.Patch(color=v, label=k) for k,v in color_key.items()]

g.fig.legend(handles=patches, bbox_to_anchor =
             [0.58, 0.93], ncol=2).get_frame().set_edgecolor('1.0')

# Title
plt.subplots_adjust(top=0.85)
g.fig.suptitle('Foreign vs. Reporting Country Emigration', size=16, weight='demi');
In [31]:
# WORLD POPULATION DATA (IN THOUSANDS)
# https://population.un.org/wpp/Download/Standard/Population/
df_pop = pd.read_excel('world_population.xlsx')
df_pop
Out[31]:
Unnamed: 0 Unnamed: 1 Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 ... Unnamed: 68 Unnamed: 69 Unnamed: 70 Unnamed: 71 Unnamed: 72 Unnamed: 73 Unnamed: 74 Unnamed: 75 Unnamed: 76 Unnamed: 77
0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 United Nations NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 Population Division NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
300 285 Estimates Bermuda 14 60 Country/Area 918 37.256 37.8 38.437 ... 65.076 64.737 64.381 64.038 63.695 63.36 63.04 62.763 62.508 62.273
301 286 Estimates Canada NaN 124 Country/Area 918 13733.4 14078.4 14445.5 ... 34539.2 34922 35296.5 35664.3 36026.7 36382.9 36732.1 37074.6 37411 37742.2
302 287 Estimates Greenland 26 304 Country/Area 918 22.993 23.466 23.936 ... 56.555 56.477 56.412 56.383 56.378 56.408 56.473 56.565 56.66 56.772
303 288 Estimates Saint Pierre and Miquelon 2 666 Country/Area 918 4.567 4.609 4.648 ... 6.323 6.251 6.168 6.073 5.992 5.933 5.885 5.845 5.821 5.795
304 289 Estimates United States of America 35 840 Country/Area 918 158804 160872 163266 ... 311584 314044 316401 318673 320878 323016 325085 327096 329065 331003

305 rows × 78 columns

In [32]:
# Select subset of dataframe
df_pop = df_pop.iloc[15:, 2:]
df_pop.head()
Out[32]:
Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 Unnamed: 10 Unnamed: 11 ... Unnamed: 68 Unnamed: 69 Unnamed: 70 Unnamed: 71 Unnamed: 72 Unnamed: 73 Unnamed: 74 Unnamed: 75 Unnamed: 76 Unnamed: 77
15 Region, subregion, country or area * Notes Country code Type Parent code 1950 1951 1952 1953 1954 ... 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
16 WORLD NaN 900 World 0 2.53643e+06 2.58403e+06 2.63086e+06 2.67761e+06 2.72485e+06 ... 7.04119e+06 7.12583e+06 7.21058e+06 7.29529e+06 7.3798e+06 7.46402e+06 7.54786e+06 7.63109e+06 7.71347e+06 7.7948e+06
17 UN development groups a 1803 Label/Separator 900 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18 More developed regions b 901 Development Group 1803 814819 824004 833720 843788 854060 ... 1.23956e+06 1.24411e+06 1.24845e+06 1.25262e+06 1.25662e+06 1.26048e+06 1.26415e+06 1.26756e+06 1.27063e+06 1.2733e+06
19 Less developed regions c 902 Development Group 1803 1.72161e+06 1.76003e+06 1.79714e+06 1.83382e+06 1.87079e+06 ... 5.80164e+06 5.88171e+06 5.96213e+06 6.04268e+06 6.12317e+06 6.20354e+06 6.28371e+06 6.36353e+06 6.44284e+06 6.52149e+06

5 rows × 76 columns

In [33]:
# Create new column header
df_pop.columns = df_pop.iloc[0]
df_pop = df_pop[1:]

df_pop.head()
Out[33]:
15 Region, subregion, country or area * Notes Country code Type Parent code 1950 1951 1952 1953 1954 ... 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
16 WORLD NaN 900 World 0 2.53643e+06 2.58403e+06 2.63086e+06 2.67761e+06 2.72485e+06 ... 7.04119e+06 7.12583e+06 7.21058e+06 7.29529e+06 7.3798e+06 7.46402e+06 7.54786e+06 7.63109e+06 7.71347e+06 7.7948e+06
17 UN development groups a 1803 Label/Separator 900 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
18 More developed regions b 901 Development Group 1803 814819 824004 833720 843788 854060 ... 1.23956e+06 1.24411e+06 1.24845e+06 1.25262e+06 1.25662e+06 1.26048e+06 1.26415e+06 1.26756e+06 1.27063e+06 1.2733e+06
19 Less developed regions c 902 Development Group 1803 1.72161e+06 1.76003e+06 1.79714e+06 1.83382e+06 1.87079e+06 ... 5.80164e+06 5.88171e+06 5.96213e+06 6.04268e+06 6.12317e+06 6.20354e+06 6.28371e+06 6.36353e+06 6.44284e+06 6.52149e+06
20 Least developed countries d 941 Development Group 902 195428 199180 203015 206986 211133 ... 856471 876867 897793 919223 941131 963520 986385 1.00969e+06 1.03339e+06 1.05744e+06

5 rows × 76 columns

In [34]:
# Cleaning up index
df_pop.set_index('Region, subregion, country or area *',inplace=True)
df_pop.drop(['Notes','Country code','Type','Parent code'],axis=1, inplace=True)
df_pop.head()
Out[34]:
15 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 ... 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
Region, subregion, country or area *
WORLD 2.53643e+06 2.58403e+06 2.63086e+06 2.67761e+06 2.72485e+06 2.77302e+06 2.82244e+06 2.87331e+06 2.92569e+06 2.97958e+06 ... 7.04119e+06 7.12583e+06 7.21058e+06 7.29529e+06 7.3798e+06 7.46402e+06 7.54786e+06 7.63109e+06 7.71347e+06 7.7948e+06
UN development groups ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
More developed regions 814819 824004 833720 843788 854060 864430 874824 885207 895576 905949 ... 1.23956e+06 1.24411e+06 1.24845e+06 1.25262e+06 1.25662e+06 1.26048e+06 1.26415e+06 1.26756e+06 1.27063e+06 1.2733e+06
Less developed regions 1.72161e+06 1.76003e+06 1.79714e+06 1.83382e+06 1.87079e+06 1.90859e+06 1.94762e+06 1.9881e+06 2.03011e+06 2.07363e+06 ... 5.80164e+06 5.88171e+06 5.96213e+06 6.04268e+06 6.12317e+06 6.20354e+06 6.28371e+06 6.36353e+06 6.44284e+06 6.52149e+06
Least developed countries 195428 199180 203015 206986 211133 215486 220061 224862 229882 235106 ... 856471 876867 897793 919223 941131 963520 986385 1.00969e+06 1.03339e+06 1.05744e+06

5 rows × 71 columns

In [35]:
# Get rid of 14 in corner and index name
del df_pop.columns.name
del df_pop.index.name

df_pop.head()
Out[35]:
1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 ... 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020
WORLD 2.53643e+06 2.58403e+06 2.63086e+06 2.67761e+06 2.72485e+06 2.77302e+06 2.82244e+06 2.87331e+06 2.92569e+06 2.97958e+06 ... 7.04119e+06 7.12583e+06 7.21058e+06 7.29529e+06 7.3798e+06 7.46402e+06 7.54786e+06 7.63109e+06 7.71347e+06 7.7948e+06
UN development groups ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
More developed regions 814819 824004 833720 843788 854060 864430 874824 885207 895576 905949 ... 1.23956e+06 1.24411e+06 1.24845e+06 1.25262e+06 1.25662e+06 1.26048e+06 1.26415e+06 1.26756e+06 1.27063e+06 1.2733e+06
Less developed regions 1.72161e+06 1.76003e+06 1.79714e+06 1.83382e+06 1.87079e+06 1.90859e+06 1.94762e+06 1.9881e+06 2.03011e+06 2.07363e+06 ... 5.80164e+06 5.88171e+06 5.96213e+06 6.04268e+06 6.12317e+06 6.20354e+06 6.28371e+06 6.36353e+06 6.44284e+06 6.52149e+06
Least developed countries 195428 199180 203015 206986 211133 215486 220061 224862 229882 235106 ... 856471 876867 897793 919223 941131 963520 986385 1.00969e+06 1.03339e+06 1.05744e+06

5 rows × 71 columns

In [36]:
# Reduce dataset to select countries and years
df_pop = df_pop.loc[countries].astype('int')

df_pop = df_pop.loc[:, '2008': '2017']

# Replace ':' with NaNs and set datatype
df_pop = df_pop.replace(':', np.nan).bfill(axis=1).ffill(axis=1)

df_pop.head()
Out[36]:
2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
Belgium 10778 10859 10938 11013 11085 11154 11221 11287 11354 11419
Bulgaria 7524 7473 7425 7378 7334 7290 7245 7199 7151 7102
Czechia 10425 10488 10536 10566 10581 10586 10591 10601 10618 10641
Denmark 5497 5526 5554 5582 5610 5638 5664 5688 5711 5732
Germany 81065 80899 80827 80855 80972 81174 81450 81787 82193 82658
In [37]:
# Push index into column for graphing
dfi_pop = df_pop.reset_index()

# Drop columns for graphing
dfi_pop = dfi_pop.drop(['2009','2010','2012','2013', '2015', '2016'],axis=1)

dfi_pop.head()
Out[37]:
index 2008 2011 2014 2017
0 Belgium 10778 11013 11221 11419
1 Bulgaria 7524 7378 7245 7102
2 Czechia 10425 10566 10591 10641
3 Denmark 5497 5582 5664 5732
4 Germany 81065 80855 81450 82658
In [38]:
# Set up PairGrid
g = sns.PairGrid(data=dfi_pop.sort_values('2008', ascending=False),
                x_vars=dfi_pop.columns[1:], y_vars=['index'],
                height=10, aspect=0.3)

# Create stripplot
g.map(sns.stripplot, size=10, orient='h', palette=sns.cubehelix_palette(32, start=0.5, rot=-0.8, reverse=True),
     linewidth=1, edgecolor='w')

# Set x-axis limits on all columns
g.set(xlim=(-10000, 100000), xlabel="Population in thousands", ylabel="")

# Column titles
titles = ['2008', '2011', '2014', '2017']

for ax, title in zip(g.axes.flat, titles):

    # Set a different title for each axes
    ax.set(title=title)

    # Make the grid horizontal instead of vertical
    ax.xaxis.grid(False)
    ax.yaxis.grid(True)

# Title
plt.subplots_adjust(top=0.85)
g.fig.suptitle('Population by Country', size=16, weight='demi')

sns.despine(left=True, bottom=True);
In [39]:
# Pivot data to long-form for visual analysis
dfg_pop = df_to_longform(df_pop, 'pop_in_thous')

dfg = pd.merge(dfg, dfg_pop, on=['country', 'year'])

dfg.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 320 entries, 0 to 319
Data columns (total 6 columns):
country                  320 non-null object
year                     320 non-null object
immigrants               320 non-null float64
emigrants                320 non-null float64
reporting_country_emi    320 non-null float64
pop_in_thous             320 non-null int64
dtypes: float64(3), int64(1), object(2)
memory usage: 17.5+ KB
In [40]:
# Create immigrants/emigrants per capita data
dfg['immigrants_per_capita'] = dfg['immigrants']/dfg['pop_in_thous']
dfg['emigrants_per_capita'] = dfg['emigrants']/dfg['pop_in_thous']

dfg.head()
Out[40]:
coun