import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
url = 'https://raw.githubusercontent.com/lorey/list-of-countries/master/csv/countries.csv'
df = pd.read_csv(url, sep=";") #index_col=0
print(df.head(5))
alpha_2 alpha_3 area capital continent currency_code \ 0 AD AND 468.0 Andorra la Vella EU EUR 1 AE ARE 82880.0 Abu Dhabi AS AED 2 AF AFG 647500.0 Kabul AS AFN 3 AG ATG 443.0 St. John's NaN XCD 4 AI AIA 102.0 The Valley NaN XCD currency_name eqivalent_fips_code fips geoname_id languages \ 0 Euro NaN AN 3041565 ca 1 Dirham NaN AE 290557 ar-AE,fa,en,hi,ur 2 Afghani NaN AF 1149361 fa-AF,ps,uz-AF,tk 3 Dollar NaN AC 3576396 en-AG 4 Dollar NaN AV 3573511 en-AI name neighbours numeric phone population \ 0 Andorra ES,FR 20 376 84000 1 United Arab Emirates SA,OM 784 971 4975593 2 Afghanistan TM,CN,IR,TJ,PK,UZ 4 93 29121286 3 Antigua and Barbuda NaN 28 +1-268 86754 4 Anguilla NaN 660 +1-264 13254 postal_code_format postal_code_regex tld 0 AD### ^(?:AD)*(\d{3})$ .ad 1 NaN NaN .ae 2 NaN NaN .af 3 NaN NaN .ag 4 NaN NaN .ai
print('Cantidad de Filas y columnas:',df.shape)
print('Nombre columnas:',df.columns)
Cantidad de Filas y columnas: (252, 19) Nombre columnas: Index(['alpha_2', 'alpha_3', 'area', 'capital', 'continent', 'currency_code', 'currency_name', 'eqivalent_fips_code', 'fips', 'geoname_id', 'languages', 'name', 'neighbours', 'numeric', 'phone', 'population', 'postal_code_format', 'postal_code_regex', 'tld'], dtype='object')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 252 entries, 0 to 251 Data columns (total 19 columns): alpha_2 251 non-null object alpha_3 252 non-null object area 252 non-null float64 capital 246 non-null object continent 210 non-null object currency_code 251 non-null object currency_name 251 non-null object eqivalent_fips_code 1 non-null object fips 249 non-null object geoname_id 252 non-null int64 languages 249 non-null object name 252 non-null object neighbours 165 non-null object numeric 252 non-null int64 phone 247 non-null object population 252 non-null int64 postal_code_format 154 non-null object postal_code_regex 152 non-null object tld 250 non-null object dtypes: float64(1), int64(3), object(15) memory usage: 37.5+ KB
df.describe()
area | geoname_id | numeric | population | |
---|---|---|---|---|
count | 2.520000e+02 | 2.520000e+02 | 252.000000 | 2.520000e+02 |
mean | 5.952879e+05 | 2.427870e+06 | 434.309524 | 2.727679e+07 |
std | 1.904818e+06 | 1.632093e+06 | 254.663139 | 1.164127e+08 |
min | 0.000000e+00 | 4.951800e+04 | 0.000000 | 0.000000e+00 |
25% | 1.098000e+03 | 1.163774e+06 | 217.000000 | 1.879528e+05 |
50% | 6.489450e+04 | 2.367967e+06 | 436.000000 | 4.268583e+06 |
75% | 3.622245e+05 | 3.478296e+06 | 652.500000 | 1.536688e+07 |
max | 1.710000e+07 | 8.505033e+06 | 894.000000 | 1.330044e+09 |
corr = df.set_index('alpha_3').corr()
sm.graphics.plot_corr(corr, xnames=list(corr.columns))
plt.show()
url = 'https://raw.githubusercontent.com/DrueStaples/Population_Growth/master/countries.csv'
df_pop = pd.read_csv(url)
print(df_pop.head(5))
country year population 0 Afghanistan 1952 8425333 1 Afghanistan 1957 9240934 2 Afghanistan 1962 10267083 3 Afghanistan 1967 11537966 4 Afghanistan 1972 13079460
df_pop_es = df_pop[df_pop["country"] == 'Spain' ]
df_pop_es.head()
country | year | population | |
---|---|---|---|
1416 | Spain | 1952 | 28549870 |
1417 | Spain | 1957 | 29841614 |
1418 | Spain | 1962 | 31158061 |
1419 | Spain | 1967 | 32850275 |
1420 | Spain | 1972 | 34513161 |
df_pop_es.shape
(12, 3)
df_pop_es.drop(['country'],axis=1)['population'].plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x12091aef0>
df_pop_ar = df_pop[(df_pop["country"] == 'Argentina')]
df_pop_ar.head()
country | year | population | |
---|---|---|---|
48 | Argentina | 1952 | 17876956 |
49 | Argentina | 1957 | 19610538 |
50 | Argentina | 1962 | 21283783 |
51 | Argentina | 1967 | 22934225 |
52 | Argentina | 1972 | 24779799 |
df_pop_ar.shape
(12, 3)
df_pop_ar.set_index('year').plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x123026c50>
anios = df_pop_es['year'].unique()
pop_ar = df_pop_ar['population'].values
pop_es = df_pop_es['population'].values
df_plot = pd.DataFrame({'Argentina': pop_ar,
'Spain': pop_es},
index=anios)
df_plot.plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x11ef1d780>
df_espanol = df.replace(np.nan, '', regex=True)
df_espanol = df_espanol[ df_espanol['languages'].str.contains('es') ]
df_espanol
alpha_2 | alpha_3 | area | capital | continent | currency_code | currency_name | eqivalent_fips_code | fips | geoname_id | languages | name | neighbours | numeric | phone | population | postal_code_format | postal_code_regex | tld | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9 | AR | ARG | 2766890.0 | Buenos Aires | SA | ARS | Peso | AR | 3865483 | es-AR,en,it,de,fr,gn | Argentina | CL,BO,UY,PY,BR | 32 | 54 | 41343201 | @####@@@ | ^[A-Z]?\d{4}[A-Z]{0,3}$ | .ar | |
13 | AW | ABW | 193.0 | Oranjestad | AWG | Guilder | AA | 3577279 | nl-AW,es,en | Aruba | 533 | 297 | 71566 | .aw | |||||
28 | BO | BOL | 1098580.0 | Sucre | SA | BOB | Boliviano | BL | 3923057 | es-BO,qu,ay | Bolivia | PE,CL,PY,BR,AR | 68 | 591 | 9947418 | .bo | |||
30 | BR | BRA | 8511965.0 | Brasilia | SA | BRL | Real | BR | 3469034 | pt-BR,es,en,fr | Brazil | SR,PE,BO,UY,GY,PY,GF,VE,CO,AR | 76 | 55 | 201103330 | #####-### | ^\d{5}-\d{3}$ | .br | |
36 | BZ | BLZ | 22966.0 | Belmopan | BZD | Dollar | BH | 3582678 | en-BZ,es | Belize | GT,MX | 84 | 501 | 314522 | .bz | ||||
45 | CL | CHL | 756950.0 | Santiago | SA | CLP | Peso | CI | 3895114 | es-CL | Chile | PE,BO,AR | 152 | 56 | 16746491 | ####### | ^(\d{7})$ | .cl | |
48 | CO | COL | 1138910.0 | Bogota | SA | COP | Peso | CO | 3686110 | es-CO | Colombia | EC,PE,PA,BR,VE | 170 | 57 | 47790000 | .co | |||
49 | CR | CRI | 51100.0 | San Jose | CRC | Colon | CS | 3624060 | es-CR,en | Costa Rica | PA,NI | 188 | 506 | 4516220 | #### | ^(\d{4})$ | .cr | ||
50 | CU | CUB | 110860.0 | Havana | CUP | Peso | CU | 3562981 | es-CU | Cuba | US | 192 | 53 | 11423000 | CP ##### | ^(?:CP)*(\d{5})$ | .cu | ||
60 | DO | DOM | 48730.0 | Santo Domingo | DOP | Peso | DR | 3508796 | es-DO | Dominican Republic | HT | 214 | +1-809 and 1-829 | 9823821 | ##### | ^(\d{5})$ | .do | ||
62 | EC | ECU | 283560.0 | Quito | SA | USD | Dollar | EC | 3658394 | es-EC | Ecuador | PE,CO | 218 | 593 | 14790608 | @####@ | ^([a-zA-Z]\d{4}[a-zA-Z])$ | .ec | |
67 | ES | ESP | 504782.0 | Madrid | EU | EUR | Euro | SP | 2510769 | es-ES,ca,gl,eu,oc | Spain | AD,PT,GI,FR,MA | 724 | 34 | 46505963 | ##### | ^(\d{5})$ | .es | |
82 | GI | GIB | 6.5 | Gibraltar | EU | GIP | Pound | GI | 2411586 | en-GI,es,it,pt | Gibraltar | ES | 292 | 350 | 27884 | .gi | |||
87 | GQ | GNQ | 28051.0 | Malabo | AF | XAF | Franc | EK | 2309096 | es-GQ,fr | Equatorial Guinea | GA,CM | 226 | 240 | 1014999 | .gq | |||
90 | GT | GTM | 108890.0 | Guatemala City | GTQ | Quetzal | GT | 3595528 | es-GT | Guatemala | MX,HN,BZ,SV | 320 | 502 | 13550440 | ##### | ^(\d{5})$ | .gt | ||
96 | HN | HND | 112090.0 | Tegucigalpa | HNL | Lempira | HO | 3608932 | es-HN | Honduras | GT,NI,SV | 340 | 504 | 7989415 | @@#### | ^([A-Z]{2}\d{4})$ | .hn | ||
157 | MX | MEX | 1972550.0 | Mexico City | MXN | Peso | MX | 3996063 | es-MX | Mexico | GT,US,BZ | 484 | 52 | 112468855 | ##### | ^(\d{5})$ | .mx | ||
165 | NI | NIC | 129494.0 | Managua | NIO | Cordoba | NU | 3617476 | es-NI,en | Nicaragua | CR,HN | 558 | 505 | 5995928 | ###-###-# | ^(\d{7})$ | .ni | ||
173 | PA | PAN | 78200.0 | Panama City | PAB | Balboa | PM | 3703430 | es-PA,en | Panama | CR,CO | 591 | 507 | 3410676 | .pa | ||||
174 | PE | PER | 1285220.0 | Lima | SA | PEN | Sol | PE | 3932488 | es-PE,qu,ay | Peru | EC,CL,BO,BR,CO | 604 | 51 | 29907003 | .pe | |||
182 | PR | PRI | 9104.0 | San Juan | USD | Dollar | RQ | 4566966 | en-PR,es-PR | Puerto Rico | 630 | +1-787 and 1-939 | 3916632 | #####-#### | ^00[679]\d{2}(?:-\d{4})?$ | .pr | |||
186 | PY | PRY | 406750.0 | Asuncion | SA | PYG | Guarani | PA | 3437598 | es-PY,gn | Paraguay | BO,BR,AR | 600 | 595 | 6375830 | #### | ^(\d{4})$ | .py | |
187 | QA | QAT | 11437.0 | Doha | AS | QAR | Rial | QA | 289688 | ar-QA,es | Qatar | SA | 634 | 974 | 840926 | .qa | |||
210 | SV | SLV | 21040.0 | San Salvador | USD | Dollar | ES | 3585968 | es-SV | El Salvador | GT,HN | 222 | 503 | 6052064 | CP #### | ^(?:CP)*(\d{4})$ | .sv | ||
226 | TT | TTO | 5128.0 | Port of Spain | TTD | Dollar | TD | 3573591 | en-TT,hns,fr,es,zh | Trinidad and Tobago | 780 | +1-868 | 1228691 | .tt | |||||
233 | US | USA | 9629091.0 | Washington | USD | Dollar | US | 6252001 | en-US,es-US,haw,fr | United States | CA,MX,CU | 840 | 1 | 310232863 | #####-#### | ^\d{5}(-\d{4})?$ | .us | ||
234 | UY | URY | 176220.0 | Montevideo | SA | UYU | Peso | UY | 3439705 | es-UY | Uruguay | BR,AR | 858 | 598 | 3477000 | ##### | ^(\d{5})$ | .uy | |
238 | VE | VEN | 912050.0 | Caracas | SA | VEF | Bolivar | VE | 3625428 | es-VE | Venezuela | GY,BR,CO | 862 | 58 | 27223228 | #### | ^(\d{4})$ | .ve | |
251 | AN | ANT | 960.0 | Willemstad | ANG | Guilder | NT | 8505032 | nl-AN,en,es | Netherlands Antilles | GP | 530 | 599 | 300000 | .an |
df_espanol.shape
(29, 19)
df_espanol.set_index('alpha_3')[['population','area']].plot(kind='bar',rot=65,figsize=(20,10))
<matplotlib.axes._subplots.AxesSubplot at 0x11ef08898>
anomalies = []
# Funcion ejemplo para detección de outliers
def find_anomalies(data):
# Set upper and lower limit to 2 standard deviation
data_std = data.std()
data_mean = data.mean()
anomaly_cut_off = data_std * 2
lower_limit = data_mean - anomaly_cut_off
upper_limit = data_mean + anomaly_cut_off
print(lower_limit.iloc[0])
print(upper_limit.iloc[0])
# Generate outliers
for index, row in data.iterrows():
outlier = row # # obtener primer columna
# print(outlier)
if (outlier.iloc[0] > upper_limit.iloc[0]) or (outlier.iloc[0] < lower_limit.iloc[0]):
anomalies.append(index)
return anomalies
find_anomalies(df_espanol.set_index('alpha_3')[['population']])
-102872707.87855022 167589161.25786057
['BRA', 'USA']
# Quitemos BRA y USA por ser outlies y volvamos a graficar:
df_espanol.drop([30,233], inplace=True)
df_espanol.set_index('alpha_3')[['population','area']].plot(kind='bar',rot=65,figsize=(20,10))
<matplotlib.axes._subplots.AxesSubplot at 0x1235486a0>
df_espanol.set_index('alpha_3')[['population','area']].sort_values(["population"]).plot(kind='bar',rot=65,figsize=(20,10))
<matplotlib.axes._subplots.AxesSubplot at 0x123883a58>
4df_espanol.set_index('alpha_3')[['area']].sort_values(["area"]).plot(kind='bar',rot=65,figsize=(20,10))
<matplotlib.axes._subplots.AxesSubplot at 0x123cf0630>
# En este caso, podriamos quitar por "lo bajo", area menor a 110.000 km2:
df_2 = df_espanol.set_index('alpha_3')
df_2 = df_2[df_2['area'] > 110000]
df_2
alpha_2 | area | capital | continent | currency_code | currency_name | eqivalent_fips_code | fips | geoname_id | languages | name | neighbours | numeric | phone | population | postal_code_format | postal_code_regex | tld | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
alpha_3 | ||||||||||||||||||
ARG | AR | 2766890.0 | Buenos Aires | SA | ARS | Peso | AR | 3865483 | es-AR,en,it,de,fr,gn | Argentina | CL,BO,UY,PY,BR | 32 | 54 | 41343201 | @####@@@ | ^[A-Z]?\d{4}[A-Z]{0,3}$ | .ar | |
BOL | BO | 1098580.0 | Sucre | SA | BOB | Boliviano | BL | 3923057 | es-BO,qu,ay | Bolivia | PE,CL,PY,BR,AR | 68 | 591 | 9947418 | .bo | |||
CHL | CL | 756950.0 | Santiago | SA | CLP | Peso | CI | 3895114 | es-CL | Chile | PE,BO,AR | 152 | 56 | 16746491 | ####### | ^(\d{7})$ | .cl | |
COL | CO | 1138910.0 | Bogota | SA | COP | Peso | CO | 3686110 | es-CO | Colombia | EC,PE,PA,BR,VE | 170 | 57 | 47790000 | .co | |||
CUB | CU | 110860.0 | Havana | CUP | Peso | CU | 3562981 | es-CU | Cuba | US | 192 | 53 | 11423000 | CP ##### | ^(?:CP)*(\d{5})$ | .cu | ||
ECU | EC | 283560.0 | Quito | SA | USD | Dollar | EC | 3658394 | es-EC | Ecuador | PE,CO | 218 | 593 | 14790608 | @####@ | ^([a-zA-Z]\d{4}[a-zA-Z])$ | .ec | |
ESP | ES | 504782.0 | Madrid | EU | EUR | Euro | SP | 2510769 | es-ES,ca,gl,eu,oc | Spain | AD,PT,GI,FR,MA | 724 | 34 | 46505963 | ##### | ^(\d{5})$ | .es | |
HND | HN | 112090.0 | Tegucigalpa | HNL | Lempira | HO | 3608932 | es-HN | Honduras | GT,NI,SV | 340 | 504 | 7989415 | @@#### | ^([A-Z]{2}\d{4})$ | .hn | ||
MEX | MX | 1972550.0 | Mexico City | MXN | Peso | MX | 3996063 | es-MX | Mexico | GT,US,BZ | 484 | 52 | 112468855 | ##### | ^(\d{5})$ | .mx | ||
NIC | NI | 129494.0 | Managua | NIO | Cordoba | NU | 3617476 | es-NI,en | Nicaragua | CR,HN | 558 | 505 | 5995928 | ###-###-# | ^(\d{7})$ | .ni | ||
PER | PE | 1285220.0 | Lima | SA | PEN | Sol | PE | 3932488 | es-PE,qu,ay | Peru | EC,CL,BO,BR,CO | 604 | 51 | 29907003 | .pe | |||
PRY | PY | 406750.0 | Asuncion | SA | PYG | Guarani | PA | 3437598 | es-PY,gn | Paraguay | BO,BR,AR | 600 | 595 | 6375830 | #### | ^(\d{4})$ | .py | |
URY | UY | 176220.0 | Montevideo | SA | UYU | Peso | UY | 3439705 | es-UY | Uruguay | BR,AR | 858 | 598 | 3477000 | ##### | ^(\d{5})$ | .uy | |
VEN | VE | 912050.0 | Caracas | SA | VEF | Bolivar | VE | 3625428 | es-VE | Venezuela | GY,BR,CO | 862 | 58 | 27223228 | #### | ^(\d{4})$ | .ve |
df_2[['area']].sort_values(["area"]).plot(kind='bar',rot=65,figsize=(20,10))
<matplotlib.axes._subplots.AxesSubplot at 0x1c2777a048>