Let's import the packages we will use and set the paths for outputs.
# Let's import pandas and some other basic packages we will use
from __future__ import division
import pandas as pd
import numpy as np
import os, sys
# GIS packages
import geopandas as gpd
from geopandas.tools import overlay
from shapely.geometry import Polygon, Point
import georasters as gr
# Alias for Geopandas
gp = gpd
# Plotting
import matplotlib as mpl
import seaborn as sns
# Setup seaborn
sns.set()
# Mapping
import geoplot as gplt
import geoplot.crs as gcrs
import mapclassify as mc
import textwrap
%pylab --no-import-all
%matplotlib inline
Using matplotlib backend: <object object at 0x195b1fa70> %pylab is deprecated, use %matplotlib inline and import the required libraries. Populating the interactive namespace from numpy and matplotlib
# Functions for plotting
def center_wrap(text, cwidth=32, **kw):
'''Center Text (to be used in legend)'''
lines = text
#lines = textwrap.wrap(text, **kw)
return "\n".join(line.center(cwidth) for line in lines)
def MyChloropleth(mydf, myfile='', myvar='',
mylegend='',
k=5,
extent=[-180, -90, 180, 90],
bbox_to_anchor=(0.2, 0.5),
edgecolor='white', facecolor='lightgray',
scheme='FisherJenks', bins=None, pct=None,
legend_labels=None,
save=True,
percent=False,
cmap='Reds',
**kwargs):
# Chloropleth
# Color scheme
if scheme=='EqualInterval':
scheme = mc.EqualInterval(mydf[myvar], k=k)
elif scheme=='Quantiles':
scheme = mc.Quantiles(mydf[myvar], k=k)
elif scheme=='BoxPlot':
scheme = mc.BoxPlot(mydf[myvar], k=k)
elif scheme=='FisherJenks':
scheme = mc.FisherJenks(mydf[myvar], k=k)
elif scheme=='FisherJenksSampled':
scheme = mc.FisherJenksSampled(mydf[myvar], k=k)
elif scheme=='HeadTailBreaks':
scheme = mc.HeadTailBreaks(mydf[myvar], k=k)
elif scheme=='JenksCaspall':
scheme = mc.JenksCaspall(mydf[myvar], k=k)
elif scheme=='JenksCaspallForced':
scheme = mc.JenksCaspallForced(mydf[myvar], k=k)
elif scheme=='JenksCaspallSampled':
scheme = mc.JenksCaspallSampled(mydf[myvar], k=k)
elif scheme=='KClassifiers':
scheme = mc.KClassifiers(mydf[myvar], k=k)
elif scheme=='Percentiles':
scheme = mc.Percentiles(mydf[myvar], pct=pct)
elif scheme=='UserDefined':
scheme = mc.UserDefined(mydf[myvar], bins=bins)
if legend_labels is None:
# Format legend
upper_bounds = scheme.bins
# get and format all bounds
bounds = []
for index, upper_bound in enumerate(upper_bounds):
if index == 0:
lower_bound = mydf[myvar].min()
else:
lower_bound = upper_bounds[index-1]
# format the numerical legend here
if percent:
bound = f'{lower_bound:.0%} - {upper_bound:.0%}'
else:
bound = f'{float(lower_bound):,.0f} - {float(upper_bound):,.0f}'
bounds.append(bound)
legend_labels = bounds
#Plot
ax = gplt.choropleth(
mydf, hue=myvar, projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
edgecolor='white', linewidth=1,
cmap=cmap, legend=True,
scheme=scheme,
legend_kwargs={'bbox_to_anchor': bbox_to_anchor,
'frameon': True,
'title':mylegend,
},
legend_labels = legend_labels,
figsize=(24, 16),
rasterized=True,
)
gplt.polyplot(
countries, projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
edgecolor=edgecolor, facecolor=facecolor,
ax=ax,
rasterized=True,
extent=extent,
)
if save:
plt.savefig(pathgraphs + myfile + '_' + myvar +'.pdf', dpi=300, bbox_inches='tight')
plt.savefig(pathgraphs + myfile + '_' + myvar +'.png', dpi=300, bbox_inches='tight')
pass
# Paths
pathout = './data/'
if not os.path.exists(pathout):
os.mkdir(pathout)
pathgraphs = './graphs/'
if not os.path.exists(pathgraphs):
os.mkdir(pathgraphs)
The Colombian Cancillery's website has a list with visa requirements for colombians. Let's use it to map countries for which visas are not required. Below is the link to the information. The problem is that it is a pdf file. Let's open the website and check it out
# Import display options for showing websites
from IPython.display import IFrame
url = 'https://www.cancilleria.gov.co/sites/default/files/FOTOS2020/relacion_de_paises_que_exigen_o_no_visas_a_colombianos_17-04-2020.pdf'
IFrame(url, width=800, height=400)
# Import package for downloading internet content and save it to file
import requests
url = 'https://www.cancilleria.gov.co/sites/default/files/FOTOS2020/relacion_de_paises_que_exigen_o_no_visas_a_colombianos_17-04-2020.pdf'
response = requests.get(url)
with open(pathout + 'visas.pdf', 'wb') as f:
f.write(response.content)
# Import package to read pdf tables
import camelot
visas = camelot.read_pdf(pathout + 'visas.pdf', pages='1-7')
Let's explore the visas object
visas
<TableList n=7>
So there are 7 tables in visas. What does Table 1 have?
visas[0]
<Table shape=(28, 3)>
visas[0].df
0 | 1 | 2 | |
---|---|---|---|
0 | MINISTERIO DE RELACIONES EXTERIORES DE COLOMBIA | ||
1 | DIRECCION DE ASUNTOS MIGRATORIOS, CONSULARES Y... | ||
2 | COORDINACION DE VISAS E INMIGRACION | ||
3 | Estados y territorios que exigen o NO visas a ... | ||
4 | EXIGEN VISA A | ||
5 | PAIS | SI | NO |
6 | Afganistán | X | |
7 | Albania | X | |
8 | Alemania | X | |
9 | Andorra | X | |
10 | Angola | X | |
11 | Antigua y Barbuda | X | |
12 | Arabia Saudita | X | |
13 | Argelia | X | |
14 | Argentina | X | |
15 | Armenia | ||
16 | Australia | X X | |
17 | Austria | X | |
18 | Azerbaiyán | X (Visa electrónica) | |
19 | Bahamas | X | |
20 | Bahréin | X (visa a la llegada y visa electrónica) | |
21 | Bangladesh | X | |
22 | Barbados | X | |
23 | Bélgica | X | |
24 | Belice | X | |
25 | Benin | ||
26 | Belarús | X X | |
27 | Bolivia | X |
Ok, let's concatenate all these pandas
dataframes.
visadf = pd.concat([i.df for i in visas]).reset_index(drop=True)
visadf
0 | 1 | 2 | |
---|---|---|---|
0 | MINISTERIO DE RELACIONES EXTERIORES DE COLOMBIA | ||
1 | DIRECCION DE ASUNTOS MIGRATORIOS, CONSULARES Y... | ||
2 | COORDINACION DE VISAS E INMIGRACION | ||
3 | Estados y territorios que exigen o NO visas a ... | ||
4 | EXIGEN VISA A | ||
... | ... | ... | ... |
220 | Taiwan | X Visa electrónica | |
221 | Wallis y Futuna (Francia) | X | |
222 | |||
223 | Actualización 21 -10-2019 | ||
224 | El presente cuadro presenta generalidades sobr... |
225 rows × 3 columns
visadf.columns = visadf.iloc[5]
visadf.head(10)
5 | PAIS | SI | NO |
---|---|---|---|
0 | MINISTERIO DE RELACIONES EXTERIORES DE COLOMBIA | ||
1 | DIRECCION DE ASUNTOS MIGRATORIOS, CONSULARES Y... | ||
2 | COORDINACION DE VISAS E INMIGRACION | ||
3 | Estados y territorios que exigen o NO visas a ... | ||
4 | EXIGEN VISA A | ||
5 | PAIS | SI | NO |
6 | Afganistán | X | |
7 | Albania | X | |
8 | Alemania | X | |
9 | Andorra | X |
visadf = visadf.iloc[6:].copy()
visadf.columns.name = ''
visadf.head(10)
PAIS | SI | NO | |
---|---|---|---|
6 | Afganistán | X | |
7 | Albania | X | |
8 | Alemania | X | |
9 | Andorra | X | |
10 | Angola | X | |
11 | Antigua y Barbuda | X | |
12 | Arabia Saudita | X | |
13 | Argelia | X | |
14 | Argentina | X | |
15 | Armenia |
Let's code SI (YES) as 1 and NO as 0
visadf['visa_req'] = visadf.SI.map({'X':1, '':0})
Let's check whether things were mapped correctly
visadf.loc[visadf.visa_req.isna()]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
16 | Australia | X X | NaN | |
18 | Azerbaiyán | X (Visa electrónica) | NaN | |
20 | Bahréin | X (visa a la llegada y visa electrónica) | NaN | |
26 | Belarús | X X | NaN | |
34 | Burundi | X X X | NaN | |
36 | Cabo Verde | X (Visa a la llegada) | NaN | |
37 | Camboya | X (Visa a la llegada) | NaN | |
39 | Canadá | X X X | NaN | |
46 | Congo | X X X | NaN | |
50 | Costa de Marfil | X X | NaN | |
58 | Egipto | X (Visa a la llegada) | NaN | |
68 | Fiji | X X | NaN | |
76 | Granada | X X | NaN | |
80 | Guinea-Bissau | X X X | NaN | |
88 | Irán | X X X X X | NaN | |
93 | Islas Salomón | X X | NaN | |
98 | Jordania | X X | NaN | |
100 | Kenia | X Visa a la llegada | NaN | |
102 | Kiribati | X X | NaN | |
105 | Laos República Democrática P | X Visa a la llegada | NaN | |
110 | Libia | X X | NaN | |
116 | Malasia | X X X | NaN | |
122 | Mauricio | X X X | NaN | |
131 | Myanmar | X (Visa a la llegada) | NaN | |
135 | Nicaragua | X (visa a la llegada para titulares de visa de... | NaN | |
137 | Nigeria | X X | NaN | |
140 | Omán | X (Visa de turismo al ingreso a Omán en los pu... | NaN | |
143 | Palau | X X | NaN | |
156 | Ruanda | X (Visa electrónica) | NaN | |
167 | Sierra Leona | X X | NaN | |
172 | Sudáfrica | X X X X X X | NaN | |
179 | Tailandia | X X | NaN | |
180 | Tanzania | X Visa a la llegada | NaN | |
183 | Togo | X X X X | NaN | |
194 | Vanuatu | X X | NaN | |
197 | Yemen | X X X | NaN | |
207 | Macao (SARG-China) (*) | X Visa a la llegada | NaN | |
220 | Taiwan | X Visa electrónica | NaN |
IFrame(url, width=800, height=400)
visadf.loc[(visadf.SI=='X X') | (visadf.SI.shift(1)=='X X') | (visadf.SI.shift(-1)=='X X')]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
15 | Armenia | 0.0 | ||
16 | Australia | X X | NaN | |
17 | Austria | X | 0.0 | |
25 | Benin | 0.0 | ||
26 | Belarús | X X | NaN | |
27 | Bolivia | X | 0.0 | |
49 | Corea República Popular Dem. | 0.0 | ||
50 | Costa de Marfil | X X | NaN | |
51 | Costa Rica | X | A titulares de Visa de EE UU o Schengen vigen... | 1.0 |
67 | Etiopía | 0.0 | ||
68 | Fiji | X X | NaN | |
69 | Filipinas | X Hasta por 30 días | 0.0 | |
75 | Ghana | 0.0 | ||
76 | Granada | X X | NaN | |
77 | Grecia | X | 0.0 | |
92 | Islas Marshall | 0.0 | ||
93 | Islas Salomón | X X | NaN | |
94 | Israel | X | 0.0 | |
97 | Japón | 0.0 | ||
98 | Jordania | X X | NaN | |
99 | Kazajstán | X (Hasta por 30 días) | 0.0 | |
101 | Kirguistán | 0.0 | ||
102 | Kiribati | X X | NaN | |
103 | Kuwait | X | 1.0 | |
109 | Liberia | 0.0 | ||
110 | Libia | X X | NaN | |
111 | Liechtenstein | X | 0.0 | |
136 | Níger | 0.0 | ||
137 | Nigeria | X X | NaN | |
138 | Noruega | X | 0.0 | |
142 | Pakistán | 0.0 | ||
143 | Palau | X X | NaN | |
144 | Panamá | X | 0.0 | |
166 | Seychelles | 0.0 | ||
167 | Sierra Leona | X X | NaN | |
168 | Singapur | X Hasta por 30 días | 0.0 | |
178 | Suazilandia | 0.0 | ||
179 | Tailandia | X X | NaN | |
180 | Tanzania | X Visa a la llegada | NaN | |
193 | Uzbekistán | 0.0 | ||
194 | Vanuatu | X X | NaN | |
195 | Venezuela | X | 0.0 |
visadf.loc[(visadf.SI=='X X X') | (visadf.SI.shift(1)=='X X X') | (visadf.SI.shift(-1)=='X X X')]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
33 | Burkina Faso | 0.0 | ||
34 | Burundi | X X X | NaN | |
35 | Bután | 0.0 | ||
38 | Camerún | 0.0 | ||
39 | Canadá | X X X | NaN | |
40 | Chad | 0.0 | ||
45 | Comoras | 0.0 | ||
46 | Congo | X X X | NaN | |
47 | Congo República Democrática | 0.0 | ||
79 | Guinea | 0.0 | ||
80 | Guinea-Bissau | X X X | NaN | |
81 | Guinea Ecuatorial | 0.0 | ||
115 | Madagascar | 0.0 | ||
116 | Malasia | X X X | NaN | |
117 | Malawi | 0.0 | ||
121 | Marruecos | 0.0 | ||
122 | Mauricio | X X X | NaN | |
123 | Mauritania | 0.0 | ||
196 | Vietnam | 0.0 | ||
197 | Yemen | X X X | NaN | |
198 | Zambia | 0.0 |
Ok it seems we have two types of errors. First, notince that sometimes the type of visa is defined, e.g., Azerbayán. Second, the OCR software has mixed some rows, so that now we have XX, XXX, etc. Looking at the pdf it seems this is due to assigning an X from a previous row to the current row ("X X") or from both the previous and next ("X X X"). Let's try to correct these errors programatically (obviously sometimes it may just be faster and better to export the dataframe, correct it by hand snd then load the corrected one, but we're here to learn, right?).
First, let's replace the repeated X with what seems to be the correct data.
X X
visadf.loc[(visadf.SI=='X X') | (visadf.SI.shift(-1)=='X X'), 'visa_req'] = 1
visadf.loc[(visadf.SI=='X X') | (visadf.SI.shift(-1)=='X X')]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
15 | Armenia | 1.0 | ||
16 | Australia | X X | 1.0 | |
25 | Benin | 1.0 | ||
26 | Belarús | X X | 1.0 | |
49 | Corea República Popular Dem. | 1.0 | ||
50 | Costa de Marfil | X X | 1.0 | |
67 | Etiopía | 1.0 | ||
68 | Fiji | X X | 1.0 | |
75 | Ghana | 1.0 | ||
76 | Granada | X X | 1.0 | |
92 | Islas Marshall | 1.0 | ||
93 | Islas Salomón | X X | 1.0 | |
97 | Japón | 1.0 | ||
98 | Jordania | X X | 1.0 | |
101 | Kirguistán | 1.0 | ||
102 | Kiribati | X X | 1.0 | |
109 | Liberia | 1.0 | ||
110 | Libia | X X | 1.0 | |
136 | Níger | 1.0 | ||
137 | Nigeria | X X | 1.0 | |
142 | Pakistán | 1.0 | ||
143 | Palau | X X | 1.0 | |
166 | Seychelles | 1.0 | ||
167 | Sierra Leona | X X | 1.0 | |
178 | Suazilandia | 1.0 | ||
179 | Tailandia | X X | 1.0 | |
193 | Uzbekistán | 1.0 | ||
194 | Vanuatu | X X | 1.0 |
X X X
visadf.loc[(visadf.SI=='X X X') | (visadf.SI.shift(1)=='X X X') | (visadf.SI.shift(-1)=='X X X'), 'visa_req'] =1
visadf.loc[(visadf.SI=='X X X') | (visadf.SI.shift(1)=='X X X') | (visadf.SI.shift(-1)=='X X X')]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
33 | Burkina Faso | 1.0 | ||
34 | Burundi | X X X | 1.0 | |
35 | Bután | 1.0 | ||
38 | Camerún | 1.0 | ||
39 | Canadá | X X X | 1.0 | |
40 | Chad | 1.0 | ||
45 | Comoras | 1.0 | ||
46 | Congo | X X X | 1.0 | |
47 | Congo República Democrática | 1.0 | ||
79 | Guinea | 1.0 | ||
80 | Guinea-Bissau | X X X | 1.0 | |
81 | Guinea Ecuatorial | 1.0 | ||
115 | Madagascar | 1.0 | ||
116 | Malasia | X X X | 1.0 | |
117 | Malawi | 1.0 | ||
121 | Marruecos | 1.0 | ||
122 | Mauricio | X X X | 1.0 | |
123 | Mauritania | 1.0 | ||
196 | Vietnam | 1.0 | ||
197 | Yemen | X X X | 1.0 | |
198 | Zambia | 1.0 |
X X X X
visadf.loc[(visadf.SI=='X X X X') | (visadf.SI.shift(1)=='X X X X') | (visadf.SI.shift(-1)=='X X X X') | (visadf.SI.shift(2)=='X X X X') | (visadf.SI.shift(-2)=='X X X X') | (visadf.SI.shift(-3)=='X X X X')]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
180 | Tanzania | X Visa a la llegada | NaN | |
181 | Tayikistán | 0.0 | ||
182 | Timor Oriental | 0.0 | ||
183 | Togo | X X X X | NaN | |
184 | Tonga | 0.0 | ||
185 | Trinidad y Tobago | X | 0.0 |
visadf.loc[(visadf.SI=='X X X X') | (visadf.SI.shift(1)=='X X X X') | (visadf.SI.shift(-1)=='X X X X') | (visadf.SI.shift(-2)=='X X X X'), 'visa_req'] = 1
visadf.loc[(visadf.SI=='X X X X') | (visadf.SI.shift(1)=='X X X X') | (visadf.SI.shift(-1)=='X X X X') | (visadf.SI.shift(-2)=='X X X X')]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
181 | Tayikistán | 1.0 | ||
182 | Timor Oriental | 1.0 | ||
183 | Togo | X X X X | 1.0 | |
184 | Tonga | 1.0 |
X X X X X
visadf.loc[(visadf.SI=='X X X X X') | (visadf.SI.shift(1)=='X X X X X') | (visadf.SI.shift(-1)=='X X X X X') | (visadf.SI.shift(-2)=='X X X X X') | (visadf.SI.shift(2)=='X X X X X')]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
86 | India | 0.0 | ||
87 | Indonesia | 0.0 | ||
88 | Irán | X X X X X | NaN | |
89 | Iraq | 0.0 | ||
90 | Irlanda | 0.0 |
visadf.loc[(visadf.SI=='X X X X X') | (visadf.SI.shift(1)=='X X X X X') | (visadf.SI.shift(-1)=='X X X X X') | (visadf.SI.shift(-2)=='X X X X X') | (visadf.SI.shift(2)=='X X X X X'), 'visa_req'] = 1
visadf.loc[(visadf.SI=='X X X X X') | (visadf.SI.shift(1)=='X X X X X') | (visadf.SI.shift(-1)=='X X X X X') | (visadf.SI.shift(-2)=='X X X X X') | (visadf.SI.shift(2)=='X X X X X')]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
86 | India | 1.0 | ||
87 | Indonesia | 1.0 | ||
88 | Irán | X X X X X | 1.0 | |
89 | Iraq | 1.0 | ||
90 | Irlanda | 1.0 |
X X X X X X
visadf.loc[(visadf.SI=='X X X X X X') | (visadf.SI.shift(1)=='X X X X X X') | (visadf.SI.shift(-1)=='X X X X X X') | (visadf.SI.shift(-2)=='X X X X X X') | (visadf.SI.shift(2)=='X X X X X X') | (visadf.SI.shift(-3)=='X X X X X X') | (visadf.SI.shift(3)=='X X X X X X')]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
169 | Siria | 0.0 | ||
170 | Somalia | 0.0 | ||
171 | Sri Lanka | 0.0 | ||
172 | Sudáfrica | X X X X X X | NaN | |
173 | Sudán del Sur | 0.0 | ||
174 | Sudán | 0.0 | ||
175 | Suecia | X | 0.0 |
visadf.loc[(visadf.SI=='X X X X X X') | (visadf.SI.shift(1)=='X X X X X X') | (visadf.SI.shift(-1)=='X X X X X X') | (visadf.SI.shift(-2)=='X X X X X X') | (visadf.SI.shift(2)=='X X X X X X') | (visadf.SI.shift(-3)=='X X X X X X'), 'visa_req'] = 1
visadf.loc[(visadf.SI=='X X X X X X') | (visadf.SI.shift(1)=='X X X X X X') | (visadf.SI.shift(-1)=='X X X X X X') | (visadf.SI.shift(-2)=='X X X X X X') | (visadf.SI.shift(2)=='X X X X X X') | (visadf.SI.shift(-3)=='X X X X X X')]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
169 | Siria | 1.0 | ||
170 | Somalia | 1.0 | ||
171 | Sri Lanka | 1.0 | ||
172 | Sudáfrica | X X X X X X | 1.0 | |
173 | Sudán del Sur | 1.0 | ||
174 | Sudán | 1.0 |
Let's also replace visa required for any row that has the word "visa".
visadf.loc[visadf.SI.str.lower().str.find('visa')!=-1]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
18 | Azerbaiyán | X (Visa electrónica) | NaN | |
20 | Bahréin | X (visa a la llegada y visa electrónica) | NaN | |
36 | Cabo Verde | X (Visa a la llegada) | NaN | |
37 | Camboya | X (Visa a la llegada) | NaN | |
58 | Egipto | X (Visa a la llegada) | NaN | |
100 | Kenia | X Visa a la llegada | NaN | |
105 | Laos República Democrática P | X Visa a la llegada | NaN | |
131 | Myanmar | X (Visa a la llegada) | NaN | |
135 | Nicaragua | X (visa a la llegada para titulares de visa de... | NaN | |
140 | Omán | X (Visa de turismo al ingreso a Omán en los pu... | NaN | |
156 | Ruanda | X (Visa electrónica) | NaN | |
180 | Tanzania | X Visa a la llegada | NaN | |
207 | Macao (SARG-China) (*) | X Visa a la llegada | NaN | |
220 | Taiwan | X Visa electrónica | NaN |
visadf.loc[visadf.SI.str.lower().str.find('visa')!=-1, 'visa_req'] = 1
visadf.loc[visadf.SI.str.lower().str.find('visa')!=-1]
PAIS | SI | NO | visa_req | |
---|---|---|---|---|
18 | Azerbaiyán | X (Visa electrónica) | 1.0 | |
20 | Bahréin | X (visa a la llegada y visa electrónica) | 1.0 | |
36 | Cabo Verde | X (Visa a la llegada) | 1.0 | |
37 | Camboya | X (Visa a la llegada) | 1.0 | |
58 | Egipto | X (Visa a la llegada) | 1.0 | |
100 | Kenia | X Visa a la llegada | 1.0 | |
105 | Laos República Democrática P | X Visa a la llegada | 1.0 | |
131 | Myanmar | X (Visa a la llegada) | 1.0 | |
135 | Nicaragua | X (visa a la llegada para titulares de visa de... | 1.0 | |
140 | Omán | X (Visa de turismo al ingreso a Omán en los pu... | 1.0 | |
156 | Ruanda | X (Visa electrónica) | 1.0 | |
180 | Tanzania | X Visa a la llegada | 1.0 | |
207 | Macao (SARG-China) (*) | X Visa a la llegada | 1.0 | |
220 | Taiwan | X Visa electrónica | 1.0 |
Let's check again
visadf.loc[visadf.visa_req.isna()]
PAIS | SI | NO | visa_req |
---|
Ok, it seems we have coded which countries need and which do not need visa for colombian citizens. Let's analyze this data a bit.
visadf['visa_req_YN'] = visadf.visa_req.map({0:'NO', 1:'YES'})
visadf
PAIS | SI | NO | visa_req | visa_req_YN | |
---|---|---|---|---|---|
6 | Afganistán | X | 1.0 | YES | |
7 | Albania | X | 0.0 | NO | |
8 | Alemania | X | 0.0 | NO | |
9 | Andorra | X | 0.0 | NO | |
10 | Angola | X | 1.0 | YES | |
... | ... | ... | ... | ... | ... |
220 | Taiwan | X Visa electrónica | 1.0 | YES | |
221 | Wallis y Futuna (Francia) | X | 0.0 | NO | |
222 | 0.0 | NO | |||
223 | Actualización 21 -10-2019 | 0.0 | NO | ||
224 | El presente cuadro presenta generalidades sobr... | 0.0 | NO |
219 rows × 5 columns
visadf.hist()
visadf.visa_req.describe()
count 219.000000 mean 0.547945 std 0.498836 min 0.000000 25% 0.000000 50% 1.000000 75% 1.000000 max 1.000000 Name: visa_req, dtype: float64
df = visadf.groupby('visa_req_YN').count().reset_index()
df
visa_req_YN | PAIS | SI | NO | visa_req | |
---|---|---|---|---|---|
0 | NO | 99 | 99 | 99 | 99 |
1 | YES | 120 | 120 | 120 | 120 |
sns.set(rc={'figure.figsize':(11.7,8.27)})
#sns.reset_orig()
sns.set_context("talk")
# Plot
fig, ax = plt.subplots()
sns.barplot(x='visa_req_YN', y='visa_req', data=df, alpha=1)
ax.tick_params(axis = 'both', which = 'major')
ax.tick_params(axis = 'both', which = 'minor')
ax.set_xlabel('Visa Required')
ax.set_ylabel('Number of Countries')
Text(0, 0.5, 'Number of Countries')
Let's try to map these countries. First let's get the Natural Earth shapefile.
import requests
import io
#headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}
url = 'https://naturalearth.s3.amazonaws.com/10m_cultural/ne_10m_admin_0_countries.zip'
r = requests.get(url, headers=headers)
countries = gp.read_file(io.BytesIO(r.content))
#countries = gpd.read_file('https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_0_countries.zip')
countries
featurecla | scalerank | LABELRANK | SOVEREIGNT | SOV_A3 | ADM0_DIF | LEVEL | TYPE | TLC | ADMIN | ... | FCLASS_TR | FCLASS_ID | FCLASS_PL | FCLASS_GR | FCLASS_IT | FCLASS_NL | FCLASS_SE | FCLASS_BD | FCLASS_UA | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Admin-0 country | 0 | 2 | Indonesia | IDN | 0 | 2 | Sovereign country | 1 | Indonesia | ... | None | None | None | None | None | None | None | None | None | MULTIPOLYGON (((117.70361 4.16341, 117.70361 4... |
1 | Admin-0 country | 0 | 3 | Malaysia | MYS | 0 | 2 | Sovereign country | 1 | Malaysia | ... | None | None | None | None | None | None | None | None | None | MULTIPOLYGON (((117.70361 4.16341, 117.69711 4... |
2 | Admin-0 country | 0 | 2 | Chile | CHL | 0 | 2 | Sovereign country | 1 | Chile | ... | None | None | None | None | None | None | None | None | None | MULTIPOLYGON (((-69.51009 -17.50659, -69.50611... |
3 | Admin-0 country | 0 | 3 | Bolivia | BOL | 0 | 2 | Sovereign country | 1 | Bolivia | ... | None | None | None | None | None | None | None | None | None | POLYGON ((-69.51009 -17.50659, -69.51009 -17.5... |
4 | Admin-0 country | 0 | 2 | Peru | PER | 0 | 2 | Sovereign country | 1 | Peru | ... | None | None | None | None | None | None | None | None | None | MULTIPOLYGON (((-69.51009 -17.50659, -69.63832... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
253 | Admin-0 country | 0 | 4 | China | CH1 | 1 | 2 | Country | 1 | Macao S.A.R | ... | None | None | None | None | None | None | None | None | None | MULTIPOLYGON (((113.55860 22.16303, 113.56943 ... |
254 | Admin-0 country | 6 | 5 | Australia | AU1 | 1 | 2 | Dependency | 1 | Ashmore and Cartier Islands | ... | None | None | None | None | None | None | None | None | None | POLYGON ((123.59702 -12.42832, 123.59775 -12.4... |
255 | Admin-0 country | 6 | 8 | Bajo Nuevo Bank (Petrel Is.) | BJN | 0 | 2 | Indeterminate | 1 | Bajo Nuevo Bank (Petrel Is.) | ... | Unrecognized | Unrecognized | Unrecognized | Unrecognized | Unrecognized | Unrecognized | Unrecognized | Unrecognized | Unrecognized | POLYGON ((-79.98929 15.79495, -79.98782 15.796... |
256 | Admin-0 country | 6 | 5 | Serranilla Bank | SER | 0 | 2 | Indeterminate | 1 | Serranilla Bank | ... | Unrecognized | Unrecognized | Unrecognized | Unrecognized | Unrecognized | Unrecognized | Unrecognized | Unrecognized | Unrecognized | POLYGON ((-78.63707 15.86209, -78.64041 15.864... |
257 | Admin-0 country | 6 | 6 | Scarborough Reef | SCR | 0 | 2 | Indeterminate | 1 | Scarborough Reef | ... | None | None | None | None | None | None | None | None | None | POLYGON ((117.75389 15.15437, 117.75569 15.151... |
258 rows × 169 columns
Luckily there are country names in Spanish. Let's see if we can merge these two data sets.
countries.NAME_ES
0 Indonesia 1 Malasia 2 Chile 3 Bolivia 4 Perú ... 253 Macao 254 Islas Ashmore y Cartier 255 Bajo Nuevo 256 Isla Serranilla 257 Bajo de Masinloc Name: NAME_ES, Length: 258, dtype: object
col_visa = countries.merge(visadf, left_on='NAME_ES', right_on='PAIS')
cmap = mpl.colors.ListedColormap(['blue', 'red'])
mylegend = center_wrap(["Visa Requirements", "For Colombian Citizens"], cwidth=32, width=32)
MyChloropleth(mydf=col_visa, myfile='col_visa', myvar='visa_req', mylegend=mylegend, k=1, bbox_to_anchor=(0.25, 0.3),
edgecolor='white', facecolor='lightgray', cmap=cmap, scheme='UserDefined', bins=[0,1], legend_labels=['NO', 'YES'],
save=False)
So it seems not everything merged correctly
col_visa.shape
(164, 174)
visadf.shape
(219, 5)
col_visa.loc[col_visa.visa_req.isna(), 'NAME_ES'].sort_values()
Series([], Name: NAME_ES, dtype: object)
So we are not linking all countries. This is usually due to symbols like accents and ~, but in this case also because the tail of the data frame includes territories of countries, so their names are non-standard (and OCR may have made some mistakes).
visadf.tail(25)
PAIS | SI | NO | visa_req | visa_req_YN | |
---|---|---|---|---|---|
200 | OTROS TERRITORIOS | 0.0 | NO | ||
201 | Aruba (Países Bajos) | X | 0.0 | NO | |
202 | Bonaire (Países Bajos) | X | 0.0 | NO | |
203 | Curazao (Países Bajos) | X | 0.0 | NO | |
204 | Guadalupe (Francia) | X | 0.0 | NO | |
205 | Guyana Francesa | X | 0.0 | NO | |
206 | Hong Kong (SARG-China) | X Por 90 días | 0.0 | NO | |
207 | Macao (SARG-China) (*) | X Visa a la llegada | 1.0 | YES | |
208 | Martinica (Francia) | X | 0.0 | NO | |
209 | Mayotte (Francia) | X | 0.0 | NO | |
210 | Nueva Caledonia (Francia) | X | 0.0 | NO | |
211 | Palestina | X | 1.0 | YES | |
212 | Polinesia Francesa | X | 0.0 | NO | |
213 | Réunion (Francia) | X | 0.0 | NO | |
214 | Saba (Países Bajos) | X | 0.0 | NO | |
215 | Saint Barthélémy (Francia) | X | 1.0 | YES | |
216 | Saint Pïerre et Miquelon (Francia) | X | 0.0 | NO | |
217 | Saint Martin (Francia) | X | 1.0 | YES | |
218 | Sint Maarten (Países Bajos) | X | 0.0 | NO | |
219 | Sint Eustatius (Países Bajos) | X | 0.0 | NO | |
220 | Taiwan | X Visa electrónica | 1.0 | YES | |
221 | Wallis y Futuna (Francia) | X | 0.0 | NO | |
222 | 0.0 | NO | |||
223 | Actualización 21 -10-2019 | 0.0 | NO | ||
224 | El presente cuadro presenta generalidades sobr... | 0.0 | NO |
Let's correct the country names to improve matching. It's always a good practice to keep the original names.
visadf['PAIS_OR'] = visadf.PAIS
visadf.loc[visadf.PAIS.str.find('(')!=-1, 'PAIS'] = visadf.loc[visadf.PAIS_OR.str.find('(')!=-1, 'PAIS_OR'].apply(lambda x: x[:x.find('(')])
visadf.PAIS = visadf.PAIS.str.strip()
visadf.tail(30)
PAIS | SI | NO | visa_req | visa_req_YN | PAIS_OR | |
---|---|---|---|---|---|---|
195 | Venezuela | X | 0.0 | NO | Venezuela | |
196 | Vietnam | 1.0 | YES | Vietnam | ||
197 | Yemen | X X X | 1.0 | YES | Yemen | |
198 | Zambia | 1.0 | YES | Zambia | ||
199 | Zimbabwe | X | 1.0 | YES | Zimbabwe | |
200 | OTROS TERRITORIOS | 0.0 | NO | OTROS TERRITORIOS | ||
201 | Aruba | X | 0.0 | NO | Aruba (Países Bajos) | |
202 | Bonaire | X | 0.0 | NO | Bonaire (Países Bajos) | |
203 | Curazao | X | 0.0 | NO | Curazao (Países Bajos) | |
204 | Guadalupe | X | 0.0 | NO | Guadalupe (Francia) | |
205 | Guyana Francesa | X | 0.0 | NO | Guyana Francesa | |
206 | Hong Kong | X Por 90 días | 0.0 | NO | Hong Kong (SARG-China) | |
207 | Macao | X Visa a la llegada | 1.0 | YES | Macao (SARG-China) (*) | |
208 | Martinica | X | 0.0 | NO | Martinica (Francia) | |
209 | Mayotte | X | 0.0 | NO | Mayotte (Francia) | |
210 | Nueva Caledonia | X | 0.0 | NO | Nueva Caledonia (Francia) | |
211 | Palestina | X | 1.0 | YES | Palestina | |
212 | Polinesia Francesa | X | 0.0 | NO | Polinesia Francesa | |
213 | Réunion | X | 0.0 | NO | Réunion (Francia) | |
214 | Saba | X | 0.0 | NO | Saba (Países Bajos) | |
215 | Saint Barthélémy | X | 1.0 | YES | Saint Barthélémy (Francia) | |
216 | Saint Pïerre et Miquelon | X | 0.0 | NO | Saint Pïerre et Miquelon (Francia) | |
217 | Saint Martin | X | 1.0 | YES | Saint Martin (Francia) | |
218 | Sint Maarten | X | 0.0 | NO | Sint Maarten (Países Bajos) | |
219 | Sint Eustatius | X | 0.0 | NO | Sint Eustatius (Países Bajos) | |
220 | Taiwan | X Visa electrónica | 1.0 | YES | Taiwan | |
221 | Wallis y Futuna | X | 0.0 | NO | Wallis y Futuna (Francia) | |
222 | 0.0 | NO | ||||
223 | Actualización 21 -10-2019 | 0.0 | NO | Actualización 21 -10-2019 | ||
224 | El presente cuadro presenta generalidades sobr... | 0.0 | NO | El presente cuadro presenta generalidades sobr... |
col_visa = countries.merge(visadf, left_on='NAME_ES', right_on='PAIS')
cmap = mpl.colors.ListedColormap(['blue', 'red'])
mylegend = center_wrap(["Visa Requirements", "For Colombian Citizens"], cwidth=32, width=32)
MyChloropleth(mydf=col_visa, myfile='col_visa', myvar='visa_req', mylegend=mylegend, k=1, bbox_to_anchor=(0.25, 0.3),
edgecolor='white', facecolor='lightgray', cmap=cmap, scheme='UserDefined', bins=[0,1], legend_labels=['NO', 'YES'],
save=False)
col_visa.shape
(170, 175)
Ok, that helped a bit. Let's see what else is different. Let's start by finding which countries are not linked.
miss_countries = list(set(countries.NAME_ES).difference(col_visa.NAME_ES))
#miss_countries.remove(None)
#miss_countries.sort()
miss_visadf = list(set(visadf.PAIS).difference(col_visa.PAIS))
miss_visadf.remove('')
miss_visadf.sort()
print('Misssing countries', miss_countries)
print('')
print('Missing PAIS', miss_visadf)
Misssing countries ['Bajo de Masinloc', 'Bangladés', 'Islas Pitcairn', 'San Pedro y Miquelón', 'islas del mar del Coral', 'Birmania', 'Papúa Nueva Guinea', 'Islas Turcas y Caicos', 'Irak', 'Sahara Occidental', 'Brunéi', 'Samoa Estadounidense', 'Isla Santa Elena', 'Ciudad del Vaticano', 'Base Naval de la Bahía de Guantánamo', 'Estados Unidos', 'Islas Heard y McDonald', 'Islas Malvinas', 'San Cristóbal y Nieves', 'Fiyi', 'Islas Cook', 'Baréin', 'Benín', 'Puerto Rico', 'Tierras Australes y Antárticas Francesas', 'Isla de Man', 'Macedonia del Norte', 'República Democrática del Congo', 'Islas Georgias del Sur y Sandwich del Sur', 'Islas Caimán', 'Moldavia', 'Corea del Norte', 'Groenlandia', 'Antártida', 'Bir Tawil', 'Montserrat', 'República Turca del Norte de Chipre', 'Somalilandia', 'Bajo Nuevo', 'Dekelia', 'República de China', 'Guernsey', 'Baikonur', 'Islas Ashmore y Cartier', 'Laos', 'Bielorrusia', 'Islas Feroe', 'Colombia', 'Islas Vírgenes de los Estados Unidos', 'Anguila', 'Rusia', 'Reino Unido', 'Isla Wake', 'Estados Federados de Micronesia', 'Isla Clipperton', 'Gibraltar', 'Campo de hielo Patagónico Sur', 'República del Congo', 'Akrotiri', 'Guinea-Bisáu', 'Åland', 'Lesoto', 'Kazajistán', 'Isla Serranilla', 'San Martín', 'Bermudas', 'Islas Marianas del Norte', 'Malaui', 'República Checa', 'Jersey', 'Yibuti', 'Islas Vírgenes Británicas', 'Zimbabue', 'Territorio Británico del Océano Índico', 'Niue', 'China', 'Catar', 'Palaos', 'Isla Brasilera', 'Territorios Australianos del Océano Índico', 'San Bartolomé', 'Islas ultramarinas de Estados Unidos', 'Glaciar de Siachen', 'Isla Norfolk', 'Corea del Sur', 'Línea Verde', 'Guam'] Missing PAIS ['Actualización 21 -10-2019', 'Bahréin', 'Bangladesh', 'Belarús', 'Benin', 'Bonaire', 'Brunei Darussalam', 'Checa República', 'China República Popular', 'Congo', 'Congo República Democrática', 'Corea República', 'Corea República Popular Dem.', 'Djibouti', 'El presente cuadro presenta generalidades sobre la política de visas de otros países y no compromete la responsabilidad del Ministerio de Relaciones Exteriores. Se \nrecomienda dirigirse directamente a la Oficina Consular del país o territorio de su interés para obtener mayor información sobre turismo, visitas e inmigración.', 'Estados Unidos de América', 'Fiji', 'Guadalupe', 'Guinea-Bissau', 'Guyana Francesa', 'Iraq', 'Kazajstán', 'Laos República Democrática P', 'Lesotho', 'Macedonia', 'Malawi', 'Martinica', 'Mayotte', 'Micronesia', 'Moldova', 'Myanmar', 'OTROS TERRITORIOS', 'Palau', 'Papua Nueva Guinea', 'Qatar', 'Reino Unido Gran Bretaña e Irlanda del Norte', 'Rusia Federación', 'Réunion', 'Saba', 'Saint Barthélémy', 'Saint Kitts y Nevis', 'Saint Martin', 'Saint Pïerre et Miquelon', 'Santa Sede', 'Sint Eustatius', 'Sint Maarten', 'Taiwan', 'Zimbabwe']
Let's choose one example to see why/how they differ
countries.loc[countries.NAME_ES.str.find('Congo')!=-1, 'NAME_ES']
30 República del Congo 31 República Democrática del Congo Name: NAME_ES, dtype: object
visadf.loc[visadf.PAIS.str.find('Congo')!=-1, 'PAIS']
46 Congo 47 Congo República Democrática Name: PAIS, dtype: object
OK, so not an easy fix. We can correct by hand the missing ones or perhaps if we can find a way of linking for each missing country in one dataframe the most similar country in the other we may be able to simplify our work. If you google for help you will find e.g., that the package difflib
can help.
# Import package to match text
import difflib
Let's create a dataframe to keep the matches we create between the country name in countries
and visadf
.
matches = pd.DataFrame(miss_countries, columns=['countries'])
matches = matches.loc[matches.countries.isna()==False].reset_index(drop=True).copy()
matches
countries | |
---|---|
0 | Bajo de Masinloc |
1 | Bangladés |
2 | Islas Pitcairn |
3 | San Pedro y Miquelón |
4 | islas del mar del Coral |
... | ... |
82 | Glaciar de Siachen |
83 | Isla Norfolk |
84 | Corea del Sur |
85 | Línea Verde |
86 | Guam |
87 rows × 1 columns
Now, let's use the difflib.get_close_matches
function to find the closest match to each country name in countries
to visadf
.
matches['visadf'] = matches.countries.apply(lambda x: difflib.get_close_matches(x, miss_visadf, cutoff=0.8))
matches.loc[matches.visadf.apply(lambda x: x!=[])]
countries | visadf | |
---|---|---|
1 | Bangladés | [Bangladesh] |
6 | Papúa Nueva Guinea | [Papua Nueva Guinea] |
21 | Baréin | [Bahréin] |
22 | Benín | [Benin] |
30 | Moldavia | [Moldova] |
59 | Guinea-Bisáu | [Guinea-Bissau] |
61 | Lesoto | [Lesotho] |
62 | Kazajistán | [Kazajstán] |
64 | San Martín | [Saint Martin] |
67 | Malaui | [Malawi] |
72 | Zimbabue | [Zimbabwe] |
76 | Catar | [Qatar] |
So it works! Of course now we need to improve matches and try to find as many as we can so we do not have to do it by hand. One way to do it is to keep the correct matches and decrease the cutoff required for a match.
matches.loc[matches.visadf.apply(lambda x: x!=[] and len(x)==1), 'k'] = 0.8
matches.loc[matches.visadf.apply(lambda x: x!=[] and len(x)==1), 'visadf_matched'] = matches.loc[matches.visadf.apply(lambda x: x!=[] and len(x)==1), 'visadf'].apply(lambda x: x[0])
matches.loc[matches.visadf.apply(lambda x: x!=[] and len(x)==1)]
countries | visadf | k | visadf_matched | |
---|---|---|---|---|
1 | Bangladés | [Bangladesh] | 0.8 | Bangladesh |
6 | Papúa Nueva Guinea | [Papua Nueva Guinea] | 0.8 | Papua Nueva Guinea |
21 | Baréin | [Bahréin] | 0.8 | Bahréin |
22 | Benín | [Benin] | 0.8 | Benin |
30 | Moldavia | [Moldova] | 0.8 | Moldova |
59 | Guinea-Bisáu | [Guinea-Bissau] | 0.8 | Guinea-Bissau |
61 | Lesoto | [Lesotho] | 0.8 | Lesotho |
62 | Kazajistán | [Kazajstán] | 0.8 | Kazajstán |
64 | San Martín | [Saint Martin] | 0.8 | Saint Martin |
67 | Malaui | [Malawi] | 0.8 | Malawi |
72 | Zimbabue | [Zimbabwe] | 0.8 | Zimbabwe |
76 | Catar | [Qatar] | 0.8 | Qatar |
for k in np.arange(0.9,0.1,-0.025):
if matches.visadf_matched.isna().sum()!=0:
print(k)
matches['visadf'] = matches.countries.apply(lambda x: difflib.get_close_matches(x, miss_visadf, cutoff=k))
matches.loc[(matches.visadf.apply(lambda x: x!=[] and len(x)==1)) & (matches.visadf_matched.isna()), 'k'] = k
matches.loc[(matches.visadf.apply(lambda x: x!=[] and len(x)==1)) & (matches.visadf_matched.isna()), 'visadf_matched'] = matches.loc[(matches.visadf.apply(lambda x: x!=[] and len(x)==1)) & (matches.visadf_matched.isna()), 'visadf'].apply(lambda x: x[0])
matches
0.9 0.875 0.85 0.825 0.7999999999999999 0.7749999999999999 0.7499999999999999 0.7249999999999999 0.6999999999999998 0.6749999999999998 0.6499999999999998 0.6249999999999998 0.5999999999999998 0.5749999999999997 0.5499999999999997 0.5249999999999997 0.49999999999999967 0.47499999999999964 0.4499999999999996 0.4249999999999996 0.3999999999999996 0.37499999999999956 0.34999999999999953 0.3249999999999995 0.2999999999999995 0.27499999999999947 0.24999999999999944 0.22499999999999942 0.1999999999999994 0.17499999999999938 0.14999999999999936 0.12499999999999933
countries | visadf | k | visadf_matched | |
---|---|---|---|---|
0 | Bajo de Masinloc | [Saint Martin, Martinica, Estados Unidos de Am... | 0.425 | Saint Martin |
1 | Bangladés | [Bangladesh, Belarús, Palau] | 0.800 | Bangladesh |
2 | Islas Pitcairn | [Qatar, Estados Unidos de América, Martinica] | 0.400 | Qatar |
3 | San Pedro y Miquelón | [Saint Pïerre et Miquelon, Santa Sede, Saint M... | 0.725 | Saint Pïerre et Miquelon |
4 | islas del mar del Coral | [Laos República Democrática P, Estados Unidos ... | 0.425 | Laos República Democrática P |
... | ... | ... | ... | ... |
82 | Glaciar de Siachen | [Saint Martin, Estados Unidos de América, Guad... | 0.400 | Saint Martin |
83 | Isla Norfolk | [Lesotho, Belarús, Laos República Democrática P] | NaN | NaN |
84 | Corea del Sur | [Corea República, Corea República Popular Dem.... | 0.550 | Corea República |
85 | Línea Verde | [Santa Sede, Bonaire, Guyana Francesa] | 0.550 | Santa Sede |
86 | Guam | [Guadalupe, Myanmar, Guinea-Bissau] | 0.450 | Guadalupe |
87 rows × 4 columns
matches.sort_values('k', ascending=False)
countries | visadf | k | visadf_matched | |
---|---|---|---|---|
72 | Zimbabue | [Zimbabwe, Djibouti, Saba] | 0.8 | Zimbabwe |
21 | Baréin | [Bahréin, Benin, Martinica] | 0.8 | Bahréin |
22 | Benín | [Benin, Réunion, Bonaire] | 0.8 | Benin |
67 | Malaui | [Malawi, Palau, Myanmar] | 0.8 | Malawi |
64 | San Martín | [Saint Martin, Sint Maarten, Saint Barthélémy] | 0.8 | Saint Martin |
... | ... | ... | ... | ... |
57 | República del Congo | [Corea República, Checa República, China Repúb... | NaN | NaN |
60 | Åland | [Santa Sede, Palau, Bangladesh] | NaN | NaN |
68 | República Checa | [Congo República Democrática, Laos República D... | NaN | NaN |
73 | Territorio Británico del Océano Índico | [Estados Unidos de América, Reino Unido Gran B... | NaN | NaN |
83 | Isla Norfolk | [Lesotho, Belarús, Laos República Democrática P] | NaN | NaN |
87 rows × 4 columns
Let's create the opposite match
matches2 = pd.DataFrame(miss_visadf, columns=['visadf'])
matches2 = matches2.loc[matches2.visadf.isna()==False].reset_index(drop=True).copy()
matches2['countries'] = matches2.visadf.apply(lambda x: difflib.get_close_matches(x, miss_countries, cutoff=0.9))
matches2.loc[matches2.countries.apply(lambda x: x!=[] and len(x)==1), 'k'] = 0.8
matches2.loc[matches2.countries.apply(lambda x: x!=[] and len(x)==1), 'countries_matched'] = matches2.loc[matches2.countries.apply(lambda x: x!=[] and len(x)==1), 'countries'].apply(lambda x: x[0])
for k in np.arange(0.9,0.1,-0.025):
if matches2.countries_matched.isna().sum()!=0:
print(k)
matches2['countries'] = matches2.visadf.apply(lambda x: difflib.get_close_matches(x, miss_countries, cutoff=k))
matches2.loc[(matches2.countries.apply(lambda x: x!=[] and len(x)==1)) & (matches2.countries_matched.isna()), 'k'] = k
matches2.loc[(matches2.countries.apply(lambda x: x!=[] and len(x)==1)) & (matches2.countries_matched.isna()), 'countries_matched'] = matches2.loc[(matches2.countries.apply(lambda x: x!=[] and len(x)==1)) & (matches2.countries_matched.isna()), 'countries'].apply(lambda x: x[0])
matches2
0.9 0.875 0.85 0.825 0.7999999999999999 0.7749999999999999 0.7499999999999999 0.7249999999999999 0.6999999999999998 0.6749999999999998 0.6499999999999998 0.6249999999999998 0.5999999999999998 0.5749999999999997 0.5499999999999997 0.5249999999999997 0.49999999999999967 0.47499999999999964 0.4499999999999996 0.4249999999999996 0.3999999999999996 0.37499999999999956 0.34999999999999953 0.3249999999999995 0.2999999999999995 0.27499999999999947 0.24999999999999944 0.22499999999999942 0.1999999999999994 0.17499999999999938 0.14999999999999936 0.12499999999999933
visadf | countries | k | countries_matched | |
---|---|---|---|---|
0 | Actualización 21 -10-2019 | [Kazajistán, Glaciar de Siachen, Somalilandia] | NaN | NaN |
1 | Bahréin | [Baréin, Brunéi, Baikonur] | 0.800 | Baréin |
2 | Bangladesh | [Bangladés, Palaos, Anguila] | 0.825 | Bangladés |
3 | Belarús | [Bielorrusia, Bermudas, Bangladés] | 0.550 | Bielorrusia |
4 | Benin | [Benín, Brunéi, Baréin] | 0.800 | Benín |
5 | Bonaire | [Baikonur, Baréin, Moldavia] | 0.525 | Baikonur |
6 | Brunei Darussalam | [Brunéi, Bielorrusia, San Bartolomé] | NaN | NaN |
7 | Checa República | [República Checa, República de China, Repúblic... | 0.600 | República Checa |
8 | China República Popular | [República Checa, República de China, Repúblic... | 0.575 | República Checa |
9 | Congo | [Colombia, República del Congo, Islas Cook] | 0.450 | Colombia |
10 | Congo República Democrática | [República Democrática del Congo, República Ch... | 0.700 | República Democrática del Congo |
11 | Corea República | [República Checa, Corea del Sur, República de ... | 0.600 | República Checa |
12 | Corea República Popular Dem. | [Corea del Norte, Corea del Sur, República Checa] | 0.550 | Corea del Norte |
13 | Djibouti | [Yibuti, Bielorrusia, Zimbabue] | 0.700 | Yibuti |
14 | El presente cuadro presenta generalidades sobr... | [] | NaN | NaN |
15 | Estados Unidos de América | [Estados Unidos, Estados Federados de Micrones... | 0.700 | Estados Unidos |
16 | Fiji | [Fiyi, Yibuti, Birmania] | 0.750 | Fiyi |
17 | Guadalupe | [Guam, Malaui, Zimbabue] | 0.450 | Guam |
18 | Guinea-Bissau | [Guinea-Bisáu, Groenlandia, Papúa Nueva Guinea] | 0.875 | Guinea-Bisáu |
19 | Guyana Francesa | [Sahara Occidental, Isla Brasilera, Tierras Au... | 0.425 | Sahara Occidental |
20 | Iraq | [Irak, Isla Brasilera, Birmania] | 0.750 | Irak |
21 | Kazajstán | [Kazajistán, Palaos, Samoa Estadounidense] | 0.800 | Kazajistán |
22 | Laos República Democrática P | [República Democrática del Congo, República Ch... | 0.725 | República Democrática del Congo |
23 | Lesotho | [Lesoto, Laos, Islas Cook] | 0.800 | Lesoto |
24 | Macedonia | [Macedonia del Norte, Moldavia, Birmania] | 0.625 | Macedonia del Norte |
25 | Malawi | [Malaui, Moldavia, Palaos] | 0.825 | Malaui |
26 | Martinica | [Baréin, San Martín, Moldavia] | NaN | NaN |
27 | Mayotte | [Montserrat, Macedonia del Norte, Laos] | 0.450 | Montserrat |
28 | Micronesia | [Birmania, Montserrat, Estados Federados de Mi... | 0.550 | Birmania |
29 | Moldova | [Moldavia, Colombia, Montserrat] | 0.650 | Moldavia |
30 | Myanmar | [Catar, San Martín, Malaui] | 0.500 | Catar |
31 | OTROS TERRITORIOS | [Sahara Occidental, Samoa Estadounidense, Bir ... | 0.175 | Sahara Occidental |
32 | Palau | [Palaos, Malaui, Gibraltar] | NaN | NaN |
33 | Papua Nueva Guinea | [Papúa Nueva Guinea, República de China, Bajo ... | 0.800 | Papúa Nueva Guinea |
34 | Qatar | [Catar, Gibraltar, Islas Pitcairn] | 0.800 | Catar |
35 | Reino Unido Gran Bretaña e Irlanda del Norte | [Macedonia del Norte, Corea del Norte, Repúbli... | NaN | NaN |
36 | Rusia Federación | [Isla de Man, Rusia, República de China] | 0.500 | Isla de Man |
37 | Réunion | [Rusia, Brunéi, Baréin] | 0.500 | Rusia |
38 | Saba | [Catar, San Martín, Palaos] | NaN | NaN |
39 | Saint Barthélémy | [San Bartolomé, San Martín, San Cristóbal y Ni... | 0.675 | San Bartolomé |
40 | Saint Kitts y Nevis | [San Cristóbal y Nieves, San Pedro y Miquelón,... | 0.625 | San Cristóbal y Nieves |
41 | Saint Martin | [San Martín, San Bartolomé, Antártida] | 0.800 | San Martín |
42 | Saint Pïerre et Miquelon | [San Pedro y Miquelón, San Bartolomé, Estados ... | 0.725 | San Pedro y Miquelón |
43 | Santa Sede | [Línea Verde, Isla Santa Elena, San Pedro y Mi... | 0.550 | Línea Verde |
44 | Sint Eustatius | [Samoa Estadounidense, San Cristóbal y Nieves,... | 0.450 | Samoa Estadounidense |
45 | Sint Maarten | [San Martín, Isla Santa Elena, Sahara Occidental] | 0.725 | San Martín |
46 | Taiwan | [Baréin, Somalilandia, Moldavia] | 0.500 | Baréin |
47 | Zimbabwe | [Zimbabue, Colombia, Birmania] | 0.875 | Zimbabue |
Clearly, this still will need some work...some were linked correctly, others not (although the correct ones seem to be in the list countries
) and still there are some that do not seem to be there at all! This is partly due to the fact that the Natural Earth shapefile does not seem to have some countries (e.g., Bonaire, Sint Eustatius, Saba, Reunion). Given the missing locations in countries
it may be easier to use matches2
to finish the matching.
namecols = ['SOVEREIGNT', 'NAME_ES'] + [col for col in countries.columns if col.find('NAME')!=-1]
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('bonaire').any(), axis=1), namecols]
SOVEREIGNT | NAME_ES | NAME | NAME_LONG | BRK_NAME | NAME_CIAWF | NAME_SORT | NAME_ALT | NAME_LEN | NAME_AR | ... | NAME_PL | NAME_PT | NAME_RU | NAME_SV | NAME_TR | NAME_UK | NAME_UR | NAME_VI | NAME_ZH | NAME_ZHT |
---|
0 rows × 35 columns
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('eust').any(), axis=1), namecols]
SOVEREIGNT | NAME_ES | NAME | NAME_LONG | BRK_NAME | NAME_CIAWF | NAME_SORT | NAME_ALT | NAME_LEN | NAME_AR | ... | NAME_PL | NAME_PT | NAME_RU | NAME_SV | NAME_TR | NAME_UK | NAME_UR | NAME_VI | NAME_ZH | NAME_ZHT |
---|
0 rows × 35 columns
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('sab').any(), axis=1), namecols]
SOVEREIGNT | NAME_ES | NAME | NAME_LONG | BRK_NAME | NAME_CIAWF | NAME_SORT | NAME_ALT | NAME_LEN | NAME_AR | ... | NAME_PL | NAME_PT | NAME_RU | NAME_SV | NAME_TR | NAME_UK | NAME_UR | NAME_VI | NAME_ZH | NAME_ZHT |
---|
0 rows × 35 columns
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('reun').any(), axis=1), namecols]
SOVEREIGNT | NAME_ES | NAME | NAME_LONG | BRK_NAME | NAME_CIAWF | NAME_SORT | NAME_ALT | NAME_LEN | NAME_AR | ... | NAME_PL | NAME_PT | NAME_RU | NAME_SV | NAME_TR | NAME_UK | NAME_UR | NAME_VI | NAME_ZH | NAME_ZHT |
---|
0 rows × 35 columns
Or have different writing/names (e.g., Myanmar, Swaziland) or because the Spanish name used by the Colombian Cancillery is non-standard (e.g., Santa Sede vs Vaticano)
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('myan').any(), axis=1), namecols]
SOVEREIGNT | NAME_ES | NAME | NAME_LONG | BRK_NAME | NAME_CIAWF | NAME_SORT | NAME_ALT | NAME_LEN | NAME_AR | ... | NAME_PL | NAME_PT | NAME_RU | NAME_SV | NAME_TR | NAME_UK | NAME_UR | NAME_VI | NAME_ZH | NAME_ZHT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
126 | Myanmar | Birmania | Myanmar | Myanmar | Myanmar | Burma | Myanmar | None | 7 | ميانمار | ... | Mjanma | Myanmar | Мьянма | Myanmar | Myanmar | М'янма | میانمار | Myanma | 缅甸 | 緬甸 |
1 rows × 35 columns
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('swazi').any(), axis=1), namecols]
SOVEREIGNT | NAME_ES | NAME | NAME_LONG | BRK_NAME | NAME_CIAWF | NAME_SORT | NAME_ALT | NAME_LEN | NAME_AR | ... | NAME_PL | NAME_PT | NAME_RU | NAME_SV | NAME_TR | NAME_UK | NAME_UR | NAME_VI | NAME_ZH | NAME_ZHT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
123 | eSwatini | Suazilandia | eSwatini | Kingdom of eSwatini | eSwatini | eSwatini | eSwatini | Swaziland | 8 | إسواتيني | ... | Eswatini | Essuatíni | Эсватини | Swaziland | Esvatini | Есватіні | اسواتینی | Eswatini | 斯威士兰 | 史瓦帝尼 |
1 rows × 35 columns
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('vatic').any(), axis=1), namecols]
SOVEREIGNT | NAME_ES | NAME | NAME_LONG | BRK_NAME | NAME_CIAWF | NAME_SORT | NAME_ALT | NAME_LEN | NAME_AR | ... | NAME_PL | NAME_PT | NAME_RU | NAME_SV | NAME_TR | NAME_UK | NAME_UR | NAME_VI | NAME_ZH | NAME_ZHT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
167 | Vatican | Ciudad del Vaticano | Vatican | Vatican | Vatican | Holy See (Vatican City) | Vatican (Holy See) | Holy See | 7 | الفاتيكان | ... | Watykan | Vaticano | Ватикан | Vatikanstaten | Vatikan | Ватикан | ویٹیکن سٹی | Thành Vatican | 梵蒂冈 | 梵蒂岡 |
1 rows × 35 columns
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('china').any(), axis=1), namecols]
SOVEREIGNT | NAME_ES | NAME | NAME_LONG | BRK_NAME | NAME_CIAWF | NAME_SORT | NAME_ALT | NAME_LEN | NAME_AR | ... | NAME_PL | NAME_PT | NAME_RU | NAME_SV | NAME_TR | NAME_UK | NAME_UR | NAME_VI | NAME_ZH | NAME_ZHT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9 | China | China | China | China | China | China | China | None | 5 | الصين | ... | Chińska Republika Ludowa | China | Китайская Народная Республика | Kina | Çin Halk Cumhuriyeti | Китайська Народна Республіка | عوامی جمہوریہ چین | Trung Quốc | 中华人民共和国 | 中華人民共和國 |
166 | China | Hong Kong | Hong Kong | Hong Kong | Hong Kong | Hong Kong | Hong Kong SAR, China | None | 9 | هونغ كونغ | ... | Hongkong | Hong Kong | Гонконг | Hongkong | Hong Kong | Гонконг | ہانگ کانگ | Hồng Kông | 香港 | 香港 |
188 | Taiwan | República de China | Taiwan | Taiwan | Taiwan | Taiwan | Taiwan | None | 6 | تايوان | ... | Republika Chińska | Taiwan | Тайвань | Taiwan | Çin Cumhuriyeti | Республіка Китай | تائیوان | Đài Loan | 中华民国 | 中華民國 |
251 | Spratly Islands | Isla Wake | Spratly Is. | Spratly Islands | Spratly Is. | None | Spratly Islands | None | 11 | جزيرة ويك | ... | Wake | Ilha Wake | Уэйк | Wake | Wake Adası | Вейк | جزیرہ ویک | Đảo Wake | 威克岛 | 威克島 |
253 | China | Macao | Macao | Macao | Macao | Macau | Macao SAR, China | None | 5 | ماكاو | ... | Makau | Macau | Макао | Macao | Makao | Аоминь | مکاؤ | Ma Cao | 澳门 | 澳門 |
5 rows × 35 columns
Let's try to see how goodf the best macthes are
matches2.sort_values('k', ascending=False)
visadf | countries | k | countries_matched | |
---|---|---|---|---|
47 | Zimbabwe | [Zimbabue, Colombia, Birmania] | 0.875 | Zimbabue |
18 | Guinea-Bissau | [Guinea-Bisáu, Groenlandia, Papúa Nueva Guinea] | 0.875 | Guinea-Bisáu |
25 | Malawi | [Malaui, Moldavia, Palaos] | 0.825 | Malaui |
2 | Bangladesh | [Bangladés, Palaos, Anguila] | 0.825 | Bangladés |
33 | Papua Nueva Guinea | [Papúa Nueva Guinea, República de China, Bajo ... | 0.800 | Papúa Nueva Guinea |
21 | Kazajstán | [Kazajistán, Palaos, Samoa Estadounidense] | 0.800 | Kazajistán |
1 | Bahréin | [Baréin, Brunéi, Baikonur] | 0.800 | Baréin |
23 | Lesotho | [Lesoto, Laos, Islas Cook] | 0.800 | Lesoto |
41 | Saint Martin | [San Martín, San Bartolomé, Antártida] | 0.800 | San Martín |
4 | Benin | [Benín, Brunéi, Baréin] | 0.800 | Benín |
34 | Qatar | [Catar, Gibraltar, Islas Pitcairn] | 0.800 | Catar |
20 | Iraq | [Irak, Isla Brasilera, Birmania] | 0.750 | Irak |
16 | Fiji | [Fiyi, Yibuti, Birmania] | 0.750 | Fiyi |
42 | Saint Pïerre et Miquelon | [San Pedro y Miquelón, San Bartolomé, Estados ... | 0.725 | San Pedro y Miquelón |
45 | Sint Maarten | [San Martín, Isla Santa Elena, Sahara Occidental] | 0.725 | San Martín |
22 | Laos República Democrática P | [República Democrática del Congo, República Ch... | 0.725 | República Democrática del Congo |
15 | Estados Unidos de América | [Estados Unidos, Estados Federados de Micrones... | 0.700 | Estados Unidos |
10 | Congo República Democrática | [República Democrática del Congo, República Ch... | 0.700 | República Democrática del Congo |
13 | Djibouti | [Yibuti, Bielorrusia, Zimbabue] | 0.700 | Yibuti |
39 | Saint Barthélémy | [San Bartolomé, San Martín, San Cristóbal y Ni... | 0.675 | San Bartolomé |
29 | Moldova | [Moldavia, Colombia, Montserrat] | 0.650 | Moldavia |
24 | Macedonia | [Macedonia del Norte, Moldavia, Birmania] | 0.625 | Macedonia del Norte |
40 | Saint Kitts y Nevis | [San Cristóbal y Nieves, San Pedro y Miquelón,... | 0.625 | San Cristóbal y Nieves |
7 | Checa República | [República Checa, República de China, Repúblic... | 0.600 | República Checa |
11 | Corea República | [República Checa, Corea del Sur, República de ... | 0.600 | República Checa |
8 | China República Popular | [República Checa, República de China, Repúblic... | 0.575 | República Checa |
3 | Belarús | [Bielorrusia, Bermudas, Bangladés] | 0.550 | Bielorrusia |
43 | Santa Sede | [Línea Verde, Isla Santa Elena, San Pedro y Mi... | 0.550 | Línea Verde |
28 | Micronesia | [Birmania, Montserrat, Estados Federados de Mi... | 0.550 | Birmania |
12 | Corea República Popular Dem. | [Corea del Norte, Corea del Sur, República Checa] | 0.550 | Corea del Norte |
5 | Bonaire | [Baikonur, Baréin, Moldavia] | 0.525 | Baikonur |
36 | Rusia Federación | [Isla de Man, Rusia, República de China] | 0.500 | Isla de Man |
37 | Réunion | [Rusia, Brunéi, Baréin] | 0.500 | Rusia |
30 | Myanmar | [Catar, San Martín, Malaui] | 0.500 | Catar |
46 | Taiwan | [Baréin, Somalilandia, Moldavia] | 0.500 | Baréin |
27 | Mayotte | [Montserrat, Macedonia del Norte, Laos] | 0.450 | Montserrat |
17 | Guadalupe | [Guam, Malaui, Zimbabue] | 0.450 | Guam |
44 | Sint Eustatius | [Samoa Estadounidense, San Cristóbal y Nieves,... | 0.450 | Samoa Estadounidense |
9 | Congo | [Colombia, República del Congo, Islas Cook] | 0.450 | Colombia |
19 | Guyana Francesa | [Sahara Occidental, Isla Brasilera, Tierras Au... | 0.425 | Sahara Occidental |
31 | OTROS TERRITORIOS | [Sahara Occidental, Samoa Estadounidense, Bir ... | 0.175 | Sahara Occidental |
0 | Actualización 21 -10-2019 | [Kazajistán, Glaciar de Siachen, Somalilandia] | NaN | NaN |
6 | Brunei Darussalam | [Brunéi, Bielorrusia, San Bartolomé] | NaN | NaN |
14 | El presente cuadro presenta generalidades sobr... | [] | NaN | NaN |
26 | Martinica | [Baréin, San Martín, Moldavia] | NaN | NaN |
32 | Palau | [Palaos, Malaui, Gibraltar] | NaN | NaN |
35 | Reino Unido Gran Bretaña e Irlanda del Norte | [Macedonia del Norte, Corea del Norte, Repúbli... | NaN | NaN |
38 | Saba | [Catar, San Martín, Palaos] | NaN | NaN |
Seems we won't be able to improve, so let's finish by hand (using code of course, since we want replicability of our results).
# Correct matches2
matches2.loc[matches2.visadf=='Suazilandia', 'countries_matched'] = 'eSwatini'
matches2.loc[matches2.visadf=='Laos República Democrática P', 'countries_matched'] = 'Laos'
matches2.loc[matches2.visadf=='Corea República Popular Dem.', 'countries_matched'] = 'Corea del Norte'
matches2.loc[matches2.visadf=='Corea República', 'countries_matched'] = 'Corea del Sur'
matches2.loc[matches2.visadf=='Martinica', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Santa Sede', 'countries_matched'] = 'Ciudad del Vaticano'
matches2.loc[matches2.visadf=='Bonaire', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Myanmar', 'countries_matched'] = 'Birmania'
matches2.loc[matches2.visadf=='Rusia Federación', 'countries_matched'] = 'Rusia'
matches2.loc[matches2.visadf=='Réunion', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Mayotte', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Reino Unido Gran Bretaña e Irlanda del Norte', 'countries_matched'] = 'Reino Unido'
matches2.loc[matches2.visadf=='Congo', 'countries_matched'] = 'República del Congo'
matches2.loc[matches2.visadf=='Sint Eustatius', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Guadalupe', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Guyana Francesa', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Brunei Darussalam', 'countries_matched'] = 'Brunéi'
matches2.loc[matches2.visadf=='Palau', 'countries_matched'] = 'Palaos'
matches2.loc[matches2.visadf=='Saba', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='China República Popular', 'countries_matched'] = 'China'
#matches2.loc[matches2.visadf=='', 'countries_matched'] = ''
#matches2.loc[matches2.visadf=='', 'countries_matched'] = ''
matches2.sort_values('k', ascending=False)
visadf | countries | k | countries_matched | |
---|---|---|---|---|
47 | Zimbabwe | [Zimbabue, Colombia, Birmania] | 0.875 | Zimbabue |
18 | Guinea-Bissau | [Guinea-Bisáu, Groenlandia, Papúa Nueva Guinea] | 0.875 | Guinea-Bisáu |
25 | Malawi | [Malaui, Moldavia, Palaos] | 0.825 | Malaui |
2 | Bangladesh | [Bangladés, Palaos, Anguila] | 0.825 | Bangladés |
33 | Papua Nueva Guinea | [Papúa Nueva Guinea, República de China, Bajo ... | 0.800 | Papúa Nueva Guinea |
21 | Kazajstán | [Kazajistán, Palaos, Samoa Estadounidense] | 0.800 | Kazajistán |
1 | Bahréin | [Baréin, Brunéi, Baikonur] | 0.800 | Baréin |
23 | Lesotho | [Lesoto, Laos, Islas Cook] | 0.800 | Lesoto |
41 | Saint Martin | [San Martín, San Bartolomé, Antártida] | 0.800 | San Martín |
4 | Benin | [Benín, Brunéi, Baréin] | 0.800 | Benín |
34 | Qatar | [Catar, Gibraltar, Islas Pitcairn] | 0.800 | Catar |
20 | Iraq | [Irak, Isla Brasilera, Birmania] | 0.750 | Irak |
16 | Fiji | [Fiyi, Yibuti, Birmania] | 0.750 | Fiyi |
42 | Saint Pïerre et Miquelon | [San Pedro y Miquelón, San Bartolomé, Estados ... | 0.725 | San Pedro y Miquelón |
45 | Sint Maarten | [San Martín, Isla Santa Elena, Sahara Occidental] | 0.725 | San Martín |
22 | Laos República Democrática P | [República Democrática del Congo, República Ch... | 0.725 | Laos |
15 | Estados Unidos de América | [Estados Unidos, Estados Federados de Micrones... | 0.700 | Estados Unidos |
10 | Congo República Democrática | [República Democrática del Congo, República Ch... | 0.700 | República Democrática del Congo |
13 | Djibouti | [Yibuti, Bielorrusia, Zimbabue] | 0.700 | Yibuti |
39 | Saint Barthélémy | [San Bartolomé, San Martín, San Cristóbal y Ni... | 0.675 | San Bartolomé |
29 | Moldova | [Moldavia, Colombia, Montserrat] | 0.650 | Moldavia |
24 | Macedonia | [Macedonia del Norte, Moldavia, Birmania] | 0.625 | Macedonia del Norte |
40 | Saint Kitts y Nevis | [San Cristóbal y Nieves, San Pedro y Miquelón,... | 0.625 | San Cristóbal y Nieves |
7 | Checa República | [República Checa, República de China, Repúblic... | 0.600 | República Checa |
11 | Corea República | [República Checa, Corea del Sur, República de ... | 0.600 | Corea del Sur |
8 | China República Popular | [República Checa, República de China, Repúblic... | 0.575 | China |
3 | Belarús | [Bielorrusia, Bermudas, Bangladés] | 0.550 | Bielorrusia |
43 | Santa Sede | [Línea Verde, Isla Santa Elena, San Pedro y Mi... | 0.550 | Ciudad del Vaticano |
28 | Micronesia | [Birmania, Montserrat, Estados Federados de Mi... | 0.550 | Birmania |
12 | Corea República Popular Dem. | [Corea del Norte, Corea del Sur, República Checa] | 0.550 | Corea del Norte |
5 | Bonaire | [Baikonur, Baréin, Moldavia] | 0.525 | |
36 | Rusia Federación | [Isla de Man, Rusia, República de China] | 0.500 | Rusia |
37 | Réunion | [Rusia, Brunéi, Baréin] | 0.500 | |
30 | Myanmar | [Catar, San Martín, Malaui] | 0.500 | Birmania |
46 | Taiwan | [Baréin, Somalilandia, Moldavia] | 0.500 | Baréin |
27 | Mayotte | [Montserrat, Macedonia del Norte, Laos] | 0.450 | |
17 | Guadalupe | [Guam, Malaui, Zimbabue] | 0.450 | |
44 | Sint Eustatius | [Samoa Estadounidense, San Cristóbal y Nieves,... | 0.450 | |
9 | Congo | [Colombia, República del Congo, Islas Cook] | 0.450 | República del Congo |
19 | Guyana Francesa | [Sahara Occidental, Isla Brasilera, Tierras Au... | 0.425 | |
31 | OTROS TERRITORIOS | [Sahara Occidental, Samoa Estadounidense, Bir ... | 0.175 | Sahara Occidental |
0 | Actualización 21 -10-2019 | [Kazajistán, Glaciar de Siachen, Somalilandia] | NaN | NaN |
6 | Brunei Darussalam | [Brunéi, Bielorrusia, San Bartolomé] | NaN | Brunéi |
14 | El presente cuadro presenta generalidades sobr... | [] | NaN | NaN |
26 | Martinica | [Baréin, San Martín, Moldavia] | NaN | |
32 | Palau | [Palaos, Malaui, Gibraltar] | NaN | Palaos |
35 | Reino Unido Gran Bretaña e Irlanda del Norte | [Macedonia del Norte, Corea del Norte, Repúbli... | NaN | Reino Unido |
38 | Saba | [Catar, San Martín, Palaos] | NaN |
#countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('aba').any(), axis=1), namecols]
This is as good as we can do it with this dataset and shapefile (of course we may need a different shapefile if we really need to ensure that we are plotting all the correct information. E.g., does French Guyana have the same visa requirements than France and the other French Territories represented in Natural Earth's shapefile as France? If so, then we are ok! Otherwise we would need another shapefile or transform this one).
visadf['countries_matched'] = visadf.PAIS
visadf.loc[visadf.PAIS.apply(lambda x: x in miss_visadf), 'countries_matched'] = visadf.loc[visadf.PAIS.apply(lambda x: x in miss_visadf)].PAIS.map(matches2[['visadf', 'countries_matched']].set_index('visadf').to_dict()['countries_matched'])
visadf
PAIS | SI | NO | visa_req | visa_req_YN | PAIS_OR | countries_matched | |
---|---|---|---|---|---|---|---|
6 | Afganistán | X | 1.0 | YES | Afganistán | Afganistán | |
7 | Albania | X | 0.0 | NO | Albania | Albania | |
8 | Alemania | X | 0.0 | NO | Alemania | Alemania | |
9 | Andorra | X | 0.0 | NO | Andorra | Andorra | |
10 | Angola | X | 1.0 | YES | Angola | Angola | |
... | ... | ... | ... | ... | ... | ... | ... |
220 | Taiwan | X Visa electrónica | 1.0 | YES | Taiwan | Baréin | |
221 | Wallis y Futuna | X | 0.0 | NO | Wallis y Futuna (Francia) | Wallis y Futuna | |
222 | 0.0 | NO | |||||
223 | Actualización 21 -10-2019 | 0.0 | NO | Actualización 21 -10-2019 | NaN | ||
224 | El presente cuadro presenta generalidades sobr... | 0.0 | NO | El presente cuadro presenta generalidades sobr... | NaN |
219 rows × 7 columns
col_visa = countries.merge(visadf, left_on='NAME_ES', right_on='countries_matched')
cmap = mpl.colors.ListedColormap(['blue', 'red'])
mylegend = center_wrap(["Visa Requirements", "For Colombian Citizens"], cwidth=32, width=32)
MyChloropleth(mydf=col_visa, myfile='col_visa', myvar='visa_req', mylegend=mylegend, k=1, bbox_to_anchor=(0.25, 0.3),
edgecolor='white', facecolor='lightgray', cmap=cmap, scheme='UserDefined', bins=[0,1], legend_labels=['NO', 'YES'],
save=False)
Let's check whether all Franch Territories depicted have the correct visa assignment.
col_visa.loc[col_visa.PAIS_OR.str.contains('Francia'), ['SOVEREIGNT', 'NAME_ES', 'ADM0_A3'] + visadf.columns.to_list()]
SOVEREIGNT | NAME_ES | ADM0_A3 | PAIS | SI | NO | visa_req | visa_req_YN | PAIS_OR | countries_matched | |
---|---|---|---|---|---|---|---|---|---|---|
19 | France | Francia | FRA | Francia | X | 0.0 | NO | Francia | Francia | |
35 | France | San Martín | MAF | Saint Martin | X | 1.0 | YES | Saint Martin (Francia) | San Martín | |
37 | Netherlands | San Martín | SXM | Saint Martin | X | 1.0 | YES | Saint Martin (Francia) | San Martín | |
168 | France | Nueva Caledonia | NCL | Nueva Caledonia | X | 0.0 | NO | Nueva Caledonia (Francia) | Nueva Caledonia | |
176 | France | San Pedro y Miquelón | SPM | Saint Pïerre et Miquelon | X | 0.0 | NO | Saint Pïerre et Miquelon (Francia) | San Pedro y Miquelón | |
190 | France | San Bartolomé | BLM | Saint Barthélémy | X | 1.0 | YES | Saint Barthélémy (Francia) | San Bartolomé | |
199 | France | Wallis y Futuna | WLF | Wallis y Futuna | X | 0.0 | NO | Wallis y Futuna (Francia) | Wallis y Futuna |
visadf.loc[visadf.PAIS_OR.str.contains('Francia')]
PAIS | SI | NO | visa_req | visa_req_YN | PAIS_OR | countries_matched | |
---|---|---|---|---|---|---|---|
71 | Francia | X | 0.0 | NO | Francia | Francia | |
204 | Guadalupe | X | 0.0 | NO | Guadalupe (Francia) | ||
208 | Martinica | X | 0.0 | NO | Martinica (Francia) | ||
209 | Mayotte | X | 0.0 | NO | Mayotte (Francia) | ||
210 | Nueva Caledonia | X | 0.0 | NO | Nueva Caledonia (Francia) | Nueva Caledonia | |
213 | Réunion | X | 0.0 | NO | Réunion (Francia) | ||
215 | Saint Barthélémy | X | 1.0 | YES | Saint Barthélémy (Francia) | San Bartolomé | |
216 | Saint Pïerre et Miquelon | X | 0.0 | NO | Saint Pïerre et Miquelon (Francia) | San Pedro y Miquelón | |
217 | Saint Martin | X | 1.0 | YES | Saint Martin (Francia) | San Martín | |
221 | Wallis y Futuna | X | 0.0 | NO | Wallis y Futuna (Francia) | Wallis y Futuna |
Seems the ones we are missing have the same equirements as mainland France, so we are lucky and do not seem to need to do more.
col_visa
data with data from the World Development indicators