# Data Manipulation Tools
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler # For scaling dataset
from sklearn.cluster import KMeans #For clustering
# Data Visualization Tools
import seaborn as sns
import plotly.graph_objs as go # version=2.7.0
import matplotlib.pyplot as plt
# pd.options.display.float_format = '{:,.5f}'.format
# Transform df(Dataframe) to numeric dtype
df = pd.read_excel("Datasets_cleaned.xlsx", 0)
# cols = df.columns
# cols
print(df.columns)
Index(['indicator', 'population', 'gdp_per_cap', 'gdp', 'gdp_growth', 'gdp_per_km2', 'land_area', 'population_density', 'total_foreign_born', 'per_foreign_born', 'population_living_abroad_diaspora', 'population_living_abroad', 'happy_planet_index', 'world_happiness', 'human_development_index', 'health_expenditure', 'government_spending', 'sustainable_economic_development_assessment(seda)', 'employment', 'political_stability&absence_of_violence', 'control_of_corruption', 'judicial_effectiveness', 'government_integrity', 'property_rights', 'tax_burden', 'overall_economic_freedom', 'financial_freedom'], dtype='object')
print(df.describe())
population gdp_per_cap gdp gdp_per_km2 \ count 67.00000 67.00000 67.00000 67.00000 mean 1,873,134.32836 42.46866 21,050.87015 4,926,014.63806 std 1,565,981.35458 70.89179 24,102.55335 13,309,816.67525 min 100,000.00000 0.20000 651.90000 5,136.56852 25% 450,000.00000 3.45000 6,107.70000 199,950.89965 50% 1,300,000.00000 18.40000 11,840.20000 812,864.46369 75% 3,000,000.00000 40.35000 26,656.00000 2,714,976.69619 max 4,900,000.00000 329.20000 127,659.60000 88,026,315.78947 land_area population_density government_spending count 67.00000 67.00000 67.00000 mean 112,900.59701 187.12315 58.40149 std 261,572.49117 345.60623 20.51883 min 298.00000 1.97974 0.00000 25% 4,580.50000 26.84716 47.00000 50% 22,806.00000 73.72588 63.40000 75% 65,781.50000 187.86034 72.70000 max 1,553,556.00000 1,935.90662 94.60000
## Clean columns
df = df.drop(columns=["happy_planet_index","world_happiness"], errors='ignore')
# Transform to float type
# Coerce every column after the first one ('indicator') to a float dtype.
# pd.to_numeric is used instead of astype because, with errors='coerce',
# cells that cannot be parsed become NaN, which astype does not do.
for column_name in df.columns[1:]:
    df[column_name] = pd.to_numeric(
        df[column_name],
        downcast='float',
        errors='coerce',
    )
Missing values: drop rows in which more than 30% of the values are NaN (i.e. keep rows with at least 70% non-missing values), and drop the Happy Planet Index and World Happiness Index columns.
## Clean rows
df = df.dropna(thresh=len(df.columns)*0.7)
df
indicator | population | gdp_per_cap | gdp | gdp_growth | gdp_per_km2 | land_area | population_density | total_foreign_born | per_foreign_born | ... | sustainable_economic_development_assessment(seda) | employment | political_stability&absence_of_violence | control_of_corruption | judicial_effectiveness | government_integrity | property_rights | tax_burden | overall_economic_freedom | financial_freedom | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Albania | 2,900,000.00000 | 34.20000 | 11,840.20020 | 1.50000 | 1,248,266.25000 | 27,398.00000 | 104.87070 | 57,616.00000 | 3.10000 | ... | 53.10000 | 91.50000 | -2.78000 | -1.52000 | 28.20000 | 26.20000 | 17.90000 | 91.80000 | 51.30000 | 10.00000 |
1 | Armenia | 3,000,000.00000 | 25.80000 | 8,621.00000 | 2.60000 | 914,796.31250 | 28,203.00000 | 102.93115 | 191,199.00000 | 10.60000 | ... | 51.70000 | 83.70000 | 0.40000 | -0.42000 | 25.40000 | 39.90000 | 54.10000 | 85.10000 | 64.50000 | 70.00000 |
2 | Bahamas | 400,000.00000 | 9.00000 | 24,555.19922 | 3.70000 | 899,100.87500 | 10,010.00000 | 39.49660 | 59,306.00000 | 16.30000 | ... | 60.40000 | 88.80000 | -0.96000 | -0.61000 | 35.20000 | 29.00000 | 27.80000 | 74.00000 | 44.70000 | 30.00000 |
4 | Barbados | 300,000.00000 | 4.80000 | 17,100.19922 | 3.00000 | 11,162,791.00000 | 430.00000 | 664.46277 | 34,475.00000 | 11.30000 | ... | 60.80000 | 93.40000 | -0.29000 | -1.41000 | 25.40000 | 18.90000 | 36.00000 | 82.40000 | 48.60000 | 40.00000 |
6 | Bhutan | 800,000.00000 | 6.50000 | 8,227.40039 | 1.20000 | 169,297.28125 | 38,394.00000 | 21.18766 | 51,106.00000 | 6.70000 | ... | 47.70000 | 93.40000 | 0.18000 | -0.26000 | 44.50000 | 32.60000 | 40.80000 | 65.70000 | 52.30000 | 60.00000 |
7 | Bosnia and Herzegovina | 3,900,000.00000 | 42.20000 | 10,957.50000 | 3.00000 | 824,428.06250 | 51,187.00000 | 68.49643 | 34,803.00000 | 0.60000 | ... | 50.20000 | 83.20000 | -0.71000 | -0.56000 | 47.40000 | 40.50000 | 55.30000 | 84.70000 | 68.70000 | 70.00000 |
8 | Botswana | 2,200,000.00000 | 36.70000 | 17,041.59961 | 2.50000 | 64,757.46875 | 566,730.00000 | 4.04366 | 160,644.00000 | 7.20000 | ... | 44.10000 | 94.30000 | 0.90000 | 1.80000 | 93.40000 | 77.40000 | 78.70000 | 63.00000 | 80.90000 | 90.00000 |
9 | Brunei Darussalam | 400,000.00000 | 32.50000 | 76,884.00000 | 0.90000 | 6,172,839.50000 | 5,265.00000 | 81.34668 | 102,733.00000 | 49.30000 | ... | 72.50000 | 93.90000 | 1.04000 | 1.53000 | 80.90000 | 73.50000 | 83.50000 | 49.90000 | 71.80000 | 70.00000 |
10 | Cabo Verde | 500,000.00000 | 3.50000 | 6,662.00000 | 1.10000 | 867,840.31250 | 4,033.00000 | 135.58015 | 14,924.00000 | 3.00000 | ... | nan | 94.90000 | -0.76000 | -0.88000 | 36.80000 | 39.90000 | 53.60000 | 87.50000 | 64.30000 | 60.00000 |
11 | Central African Republic | 4,900,000.00000 | 3.20000 | 651.90002 | 0.50000 | 5,136.56836 | 622,984.00000 | 7.47870 | 81,598.00000 | 2.90000 | ... | 16.10000 | 84.70000 | 1.00000 | 1.17000 | 53.50000 | 50.90000 | 46.50000 | 96.50000 | 63.30000 | 60.00000 |
12 | Comoros | 800,000.00000 | 1.30000 | 1,529.19995 | 3.20000 | 581,655.50000 | 2,235.00000 | 437.35196 | 12,555.00000 | 1.70000 | ... | nan | 98.70000 | -0.95000 | -0.14000 | 55.10000 | 51.80000 | 62.10000 | 99.90000 | 67.70000 | 80.00000 |
13 | Congo | 4,500,000.00000 | 29.80000 | 6,676.10010 | 6.40000 | 87,262.07812 | 341,500.00000 | 15.40483 | 392,996.00000 | 9.70000 | ... | 32.70000 | 95.90000 | -1.25000 | -0.83000 | 32.60000 | 21.20000 | 32.40000 | 72.70000 | 55.10000 | 30.00000 |
14 | Costa Rica | 4,900,000.00000 | 80.70000 | 16,435.80078 | 0.50000 | 1,580,493.50000 | 51,060.00000 | 96.07851 | 421,697.00000 | 8.70000 | ... | 58.90000 | 88.60000 | 0.98000 | 1.42000 | 54.40000 | 53.80000 | 51.40000 | 74.00000 | 57.00000 | 60.00000 |
15 | Croatia | 4,200,000.00000 | 95.10000 | 22,795.30078 | -3.90000 | 1,699,003.12500 | 55,974.00000 | 73.72588 | 576,883.00000 | 17.60000 | ... | 64.50000 | 99.50000 | 0.03000 | -0.26000 | 57.30000 | 42.00000 | 53.50000 | 89.80000 | 58.10000 | 10.00000 |
16 | Cyprus | 800,000.00000 | 29.70000 | 34,970.30078 | 1.40000 | 3,213,938.00000 | 9,241.00000 | 127.65704 | 196,167.00000 | 18.20000 | ... | 65.90000 | 91.70000 | 0.42000 | 1.50000 | 69.50000 | 70.90000 | 81.20000 | 44.00000 | 67.50000 | 70.00000 |
17 | Djibouti | 1,000,000.00000 | 3.30000 | 3,369.60010 | 1.50000 | 142,364.10938 | 23,180.00000 | 41.28494 | 112,351.00000 | 14.20000 | ... | nan | 89.00000 | 0.05000 | -0.27000 | 53.50000 | 34.70000 | 42.10000 | 80.00000 | 57.10000 | 50.00000 |
18 | Dominica | 100,000.00000 | 0.80000 | 11,374.59961 | 5.20000 | 1,065,246.37500 | 751.00000 | 98.56667 | 6,720.00000 | 8.90000 | ... | nan | 99.00000 | 0.05000 | -0.55000 | 31.30000 | 30.20000 | 35.50000 | 67.40000 | 56.70000 | 50.00000 |
19 | Equatorial Guinea | 800,000.00000 | 31.70000 | 38,639.10156 | 7.70000 | 1,130,084.50000 | 28,051.00000 | 45.19390 | 10,825.00000 | 1.30000 | ... | nan | 97.60000 | 1.13000 | 1.57000 | 51.60000 | 50.90000 | 60.90000 | 83.00000 | 61.80000 | 30.00000 |
20 | Estonia | 1,300,000.00000 | 38.50000 | 29,312.90039 | 4.80000 | 908,275.93750 | 42,388.00000 | 31.03279 | 202,348.00000 | 16.40000 | ... | 70.80000 | 96.30000 | -0.30000 | -0.66000 | 11.40000 | 23.10000 | 19.00000 | 85.70000 | 44.10000 | 40.00000 |
21 | Eswatini | 1,100,000.00000 | 11.10000 | 9,775.79980 | 2.80000 | 645,198.81250 | 17,204.00000 | 79.49151 | 31,579.00000 | 2.00000 | ... | nan | 74.20000 | -0.38000 | -0.52000 | 43.70000 | 28.40000 | 39.50000 | 83.50000 | 61.40000 | 60.00000 |
22 | Fiji | 900,000.00000 | 8.30000 | 9,268.09961 | -0.30000 | 454,197.21875 | 18,274.00000 | 49.56223 | 13,751.00000 | 2.50000 | ... | 49.00000 | 81.60000 | 1.03000 | 0.80000 | 54.70000 | 56.60000 | 57.70000 | 76.10000 | 69.90000 | 70.00000 |
23 | Gabon | 1,900,000.00000 | 35.90000 | 19,056.50000 | -3.80000 | 139,327.10938 | 257,667.00000 | 7.85942 | 268,384.00000 | 23.60000 | ... | 40.90000 | 88.50000 | -0.41000 | -0.53000 | 55.50000 | 31.40000 | 55.80000 | 70.60000 | 51.40000 | 50.00000 |
24 | Gambia | 2,100,000.00000 | 3.40000 | 1,666.59998 | -0.20000 | 335,968.37500 | 10,120.00000 | 207.56601 | 192,540.00000 | 8.80000 | ... | 40.90000 | 98.00000 | 1.19000 | 0.71000 | 57.10000 | 45.40000 | 56.60000 | 85.60000 | 64.20000 | 50.00000 |
25 | Georgia | 3,700,000.00000 | 37.20000 | 10,043.79980 | 3.00000 | 533,715.93750 | 69,700.00000 | 65.03201 | 168,802.00000 | 4.40000 | ... | 55.40000 | 92.00000 | 0.37000 | -0.16000 | 42.50000 | 38.20000 | 63.60000 | 90.90000 | 68.30000 | 60.00000 |
26 | Guinea-Bissau | 1,700,000.00000 | 2.90000 | 1,729.90002 | 4.00000 | 103,129.44531 | 28,120.00000 | 66.19072 | 22,333.00000 | 1.10000 | ... | nan | 97.00000 | -0.92000 | -0.11000 | 47.10000 | 31.80000 | 42.10000 | 80.60000 | 60.00000 | 40.00000 |
27 | Guyana | 800,000.00000 | 6.00000 | 7,872.89990 | -4.10000 | 30,480.21484 | 196,849.00000 | 3.95153 | 15,384.00000 | 1.70000 | ... | 42.80000 | 98.40000 | -1.97000 | -1.28000 | 21.70000 | 26.20000 | 17.60000 | 71.00000 | 50.90000 | 30.00000 |
28 | Iceland | 300,000.00000 | 16.50000 | 49,135.60156 | 1.80000 | 164,588.53125 | 100,250.00000 | 3.40433 | 37,522.00000 | 10.70000 | ... | 83.70000 | 89.50000 | 0.90000 | 0.84000 | 52.00000 | 42.80000 | 42.10000 | 74.00000 | 60.00000 | 60.00000 |
29 | Ireland | 4,700,000.00000 | 324.89999 | 69,230.79688 | 6.90000 | 4,716,693.50000 | 68,883.00000 | 69.87383 | 746,260.00000 | 15.90000 | ... | 80.30000 | 99.70000 | 0.17000 | -1.29000 | 24.50000 | 17.70000 | 36.00000 | 89.80000 | 58.70000 | 50.00000 |
30 | Jamaica | 2,800,000.00000 | 25.40000 | 8,975.70020 | 5.90000 | 2,345,120.50000 | 10,831.00000 | 266.87894 | 23,167.00000 | 1.30000 | ... | 49.70000 | 95.50000 | -1.08000 | -1.18000 | 29.40000 | 23.40000 | 40.60000 | 73.70000 | 51.90000 | 50.00000 |
31 | Kiribati | 100,000.00000 | 0.20000 | 1,823.30005 | 1.20000 | 246,609.12500 | 811.00000 | 143.70123 | 3,153.00000 | 2.60000 | ... | nan | 92.90000 | 1.11000 | 1.92000 | 77.10000 | 78.30000 | 87.50000 | 76.70000 | 77.70000 | 80.00000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
37 | Liberia | 4,400,000.00000 | 3.80000 | 855.09998 | 1.00000 | 39,451.82812 | 96,320.00000 | 49.12693 | 113,779.00000 | 5.30000 | ... | 30.60000 | 80.00000 | 0.03000 | -0.68000 | 28.20000 | 27.50000 | 36.70000 | 59.70000 | 56.20000 | 30.00000 |
38 | Lithuania | 2,900,000.00000 | 86.10000 | 29,972.30078 | 7.70000 | 1,373,643.87500 | 62,680.00000 | 45.13521 | 136,036.00000 | 4.90000 | ... | 67.40000 | 96.40000 | -2.30000 | -1.42000 | 23.90000 | 27.30000 | 24.10000 | 73.20000 | 52.10000 | 20.00000 |
39 | Luxembourg | 600,000.00000 | 59.90000 | 104,003.29688 | 2.50000 | 23,163,186.00000 | 2,586.00000 | 231.44749 | 249,325.00000 | 43.30000 | ... | 83.20000 | 88.80000 | -0.53000 | -1.33000 | 28.20000 | 24.60000 | 32.40000 | 60.80000 | 38.90000 | 30.00000 |
40 | Macedonia | 2,100,000.00000 | 30.30000 | 14,597.00000 | 3.70000 | 1,191,365.50000 | 25,433.00000 | 82.59953 | 130,730.00000 | 6.60000 | ... | 52.50000 | 91.00000 | 0.51000 | 0.47000 | 57.80000 | 51.80000 | 54.80000 | 79.30000 | 65.60000 | 50.00000 |
41 | Maldives | 400,000.00000 | 5.50000 | 15,553.00000 | 8.60000 | 18,456,376.00000 | 298.00000 | 1,454.43335 | 94,086.00000 | 24.40000 | ... | nan | 90.70000 | -1.09000 | -0.52000 | 44.20000 | 36.60000 | 39.40000 | 76.40000 | 62.00000 | 50.00000 |
42 | Malta | 400,000.00000 | 17.30000 | 39,833.80078 | 1.60000 | 54,746,836.00000 | 316.00000 | 1,454.03748 | 41,442.00000 | 8.00000 | ... | 68.80000 | 86.50000 | 0.75000 | 0.19000 | 56.50000 | 40.50000 | 65.90000 | 66.00000 | 61.00000 | 60.00000 |
43 | Mauritania | 3,800,000.00000 | 16.40000 | 4,328.00000 | 4.30000 | 15,911.51660 | 1,030,700.00000 | 4.28853 | 138,162.00000 | 2.30000 | ... | 28.90000 | 97.10000 | 0.69000 | 0.27000 | 10.00000 | 38.10000 | 29.70000 | 49.00000 | 31.90000 | 10.00000 |
44 | Mauritius | 1,300,000.00000 | 25.90000 | 20,421.59961 | 1.60000 | 12,758,621.00000 | 2,030.00000 | 622.96210 | 28,585.00000 | 3.60000 | ... | 60.20000 | 88.30000 | 0.60000 | 0.78000 | 56.70000 | 41.30000 | 71.20000 | 75.20000 | 67.80000 | 60.00000 |
45 | Micronesia | 100,000.00000 | 0.30000 | 3,234.30005 | 4.20000 | 427,350.43750 | 702.00000 | 150.77715 | 2,756.00000 | 2.60000 | ... | nan | 96.00000 | 1.02000 | 0.57000 | 57.90000 | 51.10000 | 73.00000 | 82.90000 | 74.20000 | 80.00000 |
46 | Moldova | 3,600,000.00000 | 18.90000 | 5,328.00000 | 1.20000 | 574,625.25000 | 32,891.00000 | 123.65451 | 142,904.00000 | 11.20000 | ... | 48.70000 | 93.90000 | 0.87000 | 2.19000 | 83.60000 | 84.10000 | 84.80000 | 41.40000 | 76.60000 | 80.00000 |
47 | Mongolia | 3,000,000.00000 | 37.00000 | 12,274.90039 | 6.50000 | 23,816.32812 | 1,553,556.00000 | 1.97974 | 17,620.00000 | 0.60000 | ... | 50.60000 | 93.40000 | -0.71000 | -0.64000 | 13.80000 | 29.00000 | 19.00000 | 69.80000 | 45.10000 | 50.00000 |
48 | Montenegro | 600,000.00000 | 10.40000 | 16,643.19922 | -4.30000 | 773,119.25000 | 13,452.00000 | 46.28037 | 82,541.00000 | 8.20000 | ... | nan | nan | 1.04000 | 0.65000 | 69.20000 | 49.90000 | 48.60000 | 72.90000 | 64.50000 | 30.00000 |
49 | Namibia | 2,300,000.00000 | 26.00000 | 11,289.70020 | 7.00000 | 31,580.60938 | 823,290.00000 | 3.07764 | 93,888.00000 | 2.20000 | ... | 39.10000 | 85.60000 | 0.16000 | -0.74000 | 23.10000 | 26.20000 | 51.70000 | 84.60000 | 61.60000 | 40.00000 |
50 | New Zealand | 4,700,000.00000 | 177.00000 | 37,294.00000 | 0.00000 | 669,093.56250 | 264,537.00000 | 18.20630 | 1,039,736.00000 | 25.10000 | ... | 78.60000 | 94.60000 | -0.10000 | -0.60000 | 23.30000 | 30.20000 | 36.70000 | 79.40000 | 48.50000 | 40.00000 |
51 | Oman | 4,000,000.00000 | 184.80000 | 46,698.00000 | 4.20000 | 597,092.06250 | 309,500.00000 | 14.97985 | 1,844,978.00000 | 30.60000 | ... | 62.10000 | 88.00000 | -1.42000 | -0.54000 | 52.50000 | 32.20000 | 32.70000 | 84.20000 | 53.40000 | 50.00000 |
52 | Panama | 4,000,000.00000 | 92.90000 | 23,023.90039 | 2.40000 | 1,249,663.75000 | 74,340.00000 | 55.13300 | 184,710.00000 | 4.70000 | ... | 54.50000 | 93.70000 | -0.25000 | -0.51000 | 35.40000 | 25.20000 | 37.30000 | 78.90000 | 63.20000 | 60.00000 |
53 | Qatar | 2,600,000.00000 | 329.20001 | 127,659.60156 | -12.20000 | 28,413,602.00000 | 11,586.00000 | 227.32222 | 1,687,640.00000 | 73.80000 | ... | 70.80000 | 92.70000 | -0.15000 | -1.83000 | 17.60000 | 26.20000 | 29.80000 | 69.60000 | 42.00000 | 30.00000 |
54 | Saint Lucia | 200,000.00000 | 2.10000 | 11,782.90039 | 4.80000 | 3,465,346.50000 | 606.00000 | 293.18689 | 12,771.00000 | 6.70000 | ... | nan | 92.70000 | -0.66000 | -1.20000 | 13.80000 | 23.40000 | 35.50000 | 79.90000 | 41.70000 | 20.00000 |
55 | Saint Vincent and the Grenadines | 100,000.00000 | 1.20000 | 11,270.50000 | 1.10000 | 3,084,833.00000 | 389.00000 | 281.78717 | 4,577.00000 | 9.40000 | ... | nan | 93.10000 | 0.66000 | 1.24000 | 83.90000 | 75.70000 | 80.40000 | 80.70000 | 78.80000 | 80.00000 |
56 | Samoa | 200,000.00000 | 1.10000 | 5,553.39990 | 1.70000 | 389,932.65625 | 2,821.00000 | 69.41343 | 4,929.00000 | 3.00000 | ... | nan | 74.70000 | -0.30000 | -0.30000 | 35.30000 | 26.90000 | 55.30000 | 74.80000 | 55.90000 | 40.00000 |
57 | Sao Tome | 200,000.00000 | 0.60000 | 3,071.80005 | 10.20000 | 622,406.62500 | 964.00000 | 212.84062 | 2,394.00000 | 3.30000 | ... | nan | 94.30000 | -1.69000 | -0.56000 | 37.60000 | 37.70000 | 31.10000 | 76.50000 | 52.80000 | 20.00000 |
58 | Seychelles | 100,000.00000 | 2.60000 | 27,602.19922 | 4.30000 | 5,714,285.50000 | 455.00000 | 208.35435 | 12,791.00000 | 13.00000 | ... | 59.10000 | 92.10000 | 0.79000 | 0.36000 | 47.10000 | 34.80000 | 68.30000 | 81.40000 | 62.00000 | 50.00000 |
59 | Slovenia | 2,100,000.00000 | 66.20000 | 32,084.90039 | 0.40000 | 3,285,196.75000 | 20,151.00000 | 102.61906 | 235,966.00000 | 11.30000 | ... | 72.60000 | 91.00000 | 1.07000 | 2.22000 | 82.70000 | 89.80000 | 89.00000 | 66.50000 | 74.10000 | 80.00000 |
60 | Solomon Islands | 600,000.00000 | 1.20000 | 1,972.59998 | 1.10000 | 42,878.58203 | 27,986.00000 | 21.84148 | 2,585.00000 | 1.40000 | ... | nan | 90.00000 | 0.21000 | 1.26000 | 72.70000 | 65.10000 | 84.00000 | 47.30000 | 63.90000 | 70.00000 |
61 | Suriname | 600,000.00000 | 7.90000 | 13,988.20020 | 4.00000 | 50,641.02734 | 156,000.00000 | 3.61155 | 46,836.00000 | 7.50000 | ... | 52.70000 | 81.50000 | -0.09000 | -0.81000 | 27.80000 | 33.40000 | 29.90000 | 74.30000 | 58.00000 | 40.00000 |
62 | Timor-Leste | 1,200,000.00000 | 5.00000 | 4,186.60010 | 4.40000 | 336,157.06250 | 14,874.00000 | 87.17626 | 10,834.00000 | 1.00000 | ... | nan | 70.30000 | nan | nan | 38.80000 | 36.80000 | 34.40000 | 71.90000 | 52.30000 | 50.00000 |
63 | Tonga | 100,000.00000 | 0.60000 | 5,386.50000 | 2.80000 | 836,820.06250 | 717.00000 | 150.02777 | 5,731.00000 | 4.80000 | ... | nan | 88.40000 | -0.37000 | 0.74000 | 64.20000 | 61.80000 | 62.80000 | 87.00000 | 76.20000 | 60.00000 |
64 | Trinidad and Tobago | 1,400,000.00000 | 43.60000 | 31,870.30078 | 1.50000 | 8,502,340.00000 | 5,128.00000 | 266.88596 | 49,883.00000 | 2.40000 | ... | 58.70000 | 95.70000 | 0.58000 | 1.84000 | 78.00000 | 75.30000 | 81.00000 | 61.30000 | 74.20000 | 70.00000 |
65 | Uruguay | 3,500,000.00000 | 74.90000 | 21,527.30078 | 3.50000 | 427,963.31250 | 175,015.00000 | 19.75060 | 71,799.00000 | 2.30000 | ... | 64.20000 | 94.20000 | 0.09000 | -0.23000 | 43.70000 | 32.90000 | 48.90000 | 83.50000 | 56.00000 | 60.00000 |
66 | Vanuatu | 300,000.00000 | 0.70000 | 2,631.39990 | -0.20000 | 57,428.82812 | 12,189.00000 | 22.66153 | 3,187.00000 | 1.30000 | ... | -64.50000 | 76.10000 | -0.13000 | -0.14000 | 59.00000 | 37.90000 | 52.30000 | 60.40000 | 57.30000 | 40.00000 |
65 rows × 25 columns
It turned out that min-max normalization (linear scaling from 0 to 1) is not a good fit for this dataset because it contains too many outliers, so we switch to standardization instead.
# Normalize the data except a few cols
# # Create a minimum and maximum processor object
# min_max_scaler = preprocessing.MinMaxScaler()
# # Create an object to transform the data to fit minmax processor
# df_norm[df_norm.columns[1:]] = min_max_scaler.fit_transform(df[df_norm.columns[1:]])
## Data standardization
# Work on a copy so the cleaned frame `df` keeps its original values.
df_norm = df.copy()
value_cols = df_norm.columns[1:]
# Rescale every column after the first one ('indicator') with sklearn's
# preprocessing.scale.
df_norm[value_cols] = preprocessing.scale(df_norm[value_cols])
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:16: DataConversionWarning: Data with input dtype float32 were all converted to float64 by the scale function.
population | gdp_per_cap | gdp | gdp_growth | gdp_per_km2 | land_area | population_density | total_foreign_born | per_foreign_born | population_living_abroad_diaspora | ... | sustainable_economic_development_assessment(seda) | employment | political_stability&absence_of_violence | control_of_corruption | judicial_effectiveness | government_integrity | property_rights | tax_burden | overall_economic_freedom | financial_freedom | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 65.00000 | 65.00000 | 65.00000 | 65.00000 | 65.00000 | 65.00000 | 65.00000 | 64.00000 | 64.00000 | 64.00000 | ... | 46.00000 | 64.00000 | 64.00000 | 64.00000 | 65.00000 | 65.00000 | 65.00000 | 65.00000 | 65.00000 | 65.00000 |
mean | 0.00000 | -0.00000 | -0.00000 | -0.00000 | 0.00000 | -0.00000 | 0.00000 | -0.00000 | -0.00000 | -0.00000 | ... | 0.00000 | -0.00000 | -0.00000 | 0.00000 | -0.00000 | 0.00000 | 0.00000 | -0.00000 | -0.00000 | 0.00000 |
std | 1.00778 | 1.00778 | 1.00778 | 1.00778 | 1.00778 | 1.00778 | 1.00778 | 1.00791 | 1.00791 | 1.00791 | ... | 1.01105 | 1.00791 | 1.00791 | 1.00791 | 1.00778 | 1.00778 | 1.00778 | 1.00778 | 1.00778 | 1.00778 |
min | -1.15279 | -0.59686 | -0.84082 | -4.32306 | -0.43846 | -0.44005 | -0.59460 | -0.46968 | -0.76103 | -0.78808 | ... | -5.11444 | -3.27375 | -2.90097 | -1.75474 | -1.78433 | -1.34947 | -1.63901 | -2.64631 | -2.66445 | -2.05991 |
25% | -0.89727 | -0.55052 | -0.63619 | -0.40885 | -0.41186 | -0.42169 | -0.48721 | -0.44627 | -0.59990 | -0.63302 | ... | -0.32448 | -0.38002 | -0.65938 | -0.62169 | -0.87710 | -0.79128 | -0.71615 | -0.38544 | -0.70664 | -0.52090 |
50% | -0.32235 | -0.34126 | -0.37373 | 0.00317 | -0.34316 | -0.35303 | -0.32941 | -0.34889 | -0.31325 | -0.42974 | ... | 0.08584 | 0.28898 | 0.14196 | -0.23752 | -0.06457 | -0.26157 | -0.04077 | 0.13568 | 0.03233 | -0.00789 |
75% | 0.69973 | -0.05896 | 0.20531 | 0.53291 | -0.16237 | -0.17923 | 0.01963 | -0.10795 | 0.12774 | 0.22297 | ... | 0.56037 | 0.64129 | 0.83230 | 0.75451 | 0.57348 | 0.54154 | 0.65524 | 0.68887 | 0.75212 | 0.50511 |
max | 1.91345 | 4.02372 | 4.46156 | 2.26929 | 6.02048 | 5.46691 | 4.77406 | 5.00718 | 4.23914 | 3.99662 | ... | 1.33748 | 1.38155 | 1.39811 | 2.18420 | 2.37300 | 2.75721 | 2.04210 | 2.04378 | 2.03813 | 2.04413 |
8 rows × 24 columns
# Visualize the pairwise correlation between indicators in a heatmap.
cols_name = df_norm.columns
# Select the value columns by label. Note that `frame[1:]` is a *row* slice:
# it would drop the first country and keep the text column 'indicator' in
# the frame handed to corr().
df_corr = df_norm[cols_name[1:]]
cor = df_corr.corr()
# Color palette
cmap = sns.cubehelix_palette(light=1, as_cmap=True)
sns.heatmap(cor,
            # square = True,
            linewidths=1,
            cmap=cmap)
<matplotlib.axes._subplots.AxesSubplot at 0x11a27d400>
#K means Clustering
df_kmeans = df_norm.dropna(axis='index', how='any')
def doKmeans(X, nclust=2):
    """Cluster the rows of X with k-means.

    Fits a KMeans model with `nclust` clusters on X and returns a tuple
    ``(labels, centers)``: the cluster label predicted for each row of X
    and the fitted model's ``cluster_centers_``.
    """
    estimator = KMeans(nclust)
    estimator.fit(X)
    return estimator.predict(X), estimator.cluster_centers_
# Cluster on every column after the first one ('indicator'); take a copy so
# adding the label column below does not write into a slice of df_kmeans.
sample = df_kmeans[df_kmeans.columns[1:]].copy()
# print(sample)
clust_labels, cent = doKmeans(sample, 4)
# Give the labels the same row index as `sample`. df_kmeans comes from
# dropna(), so its index has gaps; labels carrying a default 0..n-1 index
# would be aligned to the wrong rows (and to NaN) when inserted.
kmeans = pd.DataFrame(clust_labels, index=sample.index)
sample.insert(sample.shape[1], 'kmeans', kmeans[0])

# Scatter the first two value columns, colored by cluster label.
fig = plt.figure()
ax = fig.add_subplot(111)
axis1 = cols_name[1]
axis2 = cols_name[2]
print(axis1, axis2)
scatter = ax.scatter(sample[axis1], sample[axis2],
                     c=kmeans[0], s=50)
ax.set_title('K-Means Clustering')
ax.set_xlabel(axis1)
ax.set_ylabel(axis2)
plt.colorbar(scatter)
population gdp_per_cap
<matplotlib.colorbar.Colorbar at 0x11c3d5908>
#choropleth-maps
#Ref: https://plot.ly/python/choropleth-maps/
import plotly.plotly as py #For World Map
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
# Column shown on the world map; switch `axis` to plot another indicator.
axis = cols_name[2]  # 'gdp_per_cap'
# axis = 'population'
data = dict(type='choropleth',
            locations=df_norm['indicator'],
            locationmode='country names',
            z=df_norm[axis],
            text=df_norm['indicator'],
            colorbar={'title': axis},
            marker=dict(
                line=dict(
                    # color = 'rgb(180,180,180)',
                    width=0.5
                ))
            )
# Values follow the plotly layout.geo reference: scope "world" and the
# lowercase projection name "mercator".
scope = 'world'
# scope = 'asia'
# Title the map with the plotted column (`axis`), so it stays in sync with
# the colorbar title when `axis` is changed.
layout = dict(title=axis,
              geo=dict(scope=scope,
                       showframe=False,
                       projection={'type': 'mercator'})
              )
choromap3 = go.Figure(data=[data], layout=layout)
iplot(choromap3)
import country_converter as converter
countries = list(df_norm.indicator.values)
df_norm['continent'] = converter.convert(names=countries, to='continent')
# df_norm['continent']
df_norm[cols_name[1:]].loc[0].values
array([ 0.63585082, -0.11935518, -0.37372741, -0.29113191, -0.29178519, -0.33699407, -0.21428809, -0.36406527, -0.55749416, 2.46398993, 1.29700745, 0.48616003, -0.35585942, 0.70427717, 0.00529987, 0.0831304 , -2.90096556, -1.45324533, -0.87709562, -0.86532397, -1.6235437 , 1.39438407, -0.8026131 , -2.059913 ])
# Indicator columns grouped by theme; each group is drawn as one wedge of
# the radar charts, in this order.
directions = {
    'life': [
        'population_density',
        'human_development_index',
        'health_expenditure',
        'employment',
    ],
    'economics': [
        'gdp_per_cap',
        'gdp_growth',
        'sustainable_economic_development_assessment(seda)',
        'overall_economic_freedom',
    ],
    'government': [
        'political_stability&absence_of_violence',
        'control_of_corruption',
        'tax_burden',
        'government_spending',
    ],
    'laws': [
        'property_rights',
        'government_integrity',
        'judicial_effectiveness',
        'financial_freedom',
    ],
    'migration': [
        'total_foreign_born',
        'per_foreign_born',
        'population_living_abroad_diaspora',
        'population_living_abroad',
    ],
}
Legend
# import plotly.plotly as py
# import plotly.graph_objs as go
data = []
color = "rgba(255,255,255,0)"
def format(l):
    """Return the items of `l` as a new list with the first item repeated last.

    An empty input yields an empty list. The input itself is not modified.

    NOTE(review): this name shadows the builtin ``format``; it is kept so
    existing callers keep working.
    """
    items = list(l)
    # list.append() returns None, so the list has to be built first and
    # returned separately rather than returning the append() call.
    if items:
        items.append(items[0])
    return items
# theta = list(cols_name[1:])
# theta.append(cols_name[1])
# print(theta)
# r=list(df_norm[cols_name[1:]].iloc[i].values)+[df_norm[cols_name[1]].iloc[i]]
# order = []
# for k, v in directions.items():
# order += v
# print(order)
alpha='99'
colors=['D94F70'+alpha,'F0C05A'+alpha,'45B5AA'+alpha,'5A5B9F'+alpha,'9B1B30'+alpha]
colors=['45B5AA','ffa600', 'ff6361','bc5090',
# '58508d',
'003f5c']
colors=[i+alpha for i in colors]
extra_theta=['a','b','c','d','e','f','g','h','i']
range_min = -5
range_max = 5
offset = 2.5
bgcolor="rgb(224,225,227)"
# Build the radar traces for the demo chart. Only the first row of df_norm
# is drawn (the loop stops as soon as i reaches 1).
for i in range(len(df_norm)):
    if i >= 1:
        break
    for index, k in enumerate(directions.keys()):
        r = df_norm[directions[k]].iloc[i].values
        theta = directions[k].copy()
        r = list(r)
        # Each group is bracketed by two boundary categories taken from
        # extra_theta; consecutive groups share one boundary.
        prevkey = extra_theta[(index) % len(extra_theta)]
        nextkey = extra_theta[(index + 1) % len(extra_theta)]
        r = [range_min + offset] + r + [range_min + offset]
        theta = [prevkey] + theta + [nextkey]
        data.append(go.Scatterpolar(
            name=df_norm['indicator'].iloc[i],  # Nation as the trace name
            # Flags in `mode` are joined with "+", not ",".
            mode="lines+markers",
            marker=dict(
                # color = "rgba(180,120,150,0.4)",
                color=colors[index]
            ),
            r=r,
            theta=theta,
            line=dict(
                shape="spline",
                # color = "rgba(180,120,150,0.3)",
                # line.color takes a single color string, not a list.
                color=colors[index],
                # dash='dot', #"solid", "dot", "dash", "longdash", "dashdot", or "longdashdot"
                smoothing=1.3,
                width=4
            ),
            hoverlabel=dict(
                namelength=20
            )
        ))
# Add a blank angle: a trace over the otherwise unused categories 'f'..'j'.
# NOTE(review): presumably this only reserves an empty wedge on the angular
# axis, so the line is kept fully transparent like the marker - confirm.
data.append(go.Scatterpolar(
    name=df_norm['indicator'].iloc[i],  # Nation as the trace name
    mode="lines",
    marker=dict(
        color="rgba(180,120,150,0)",
    ),
    r=[0, 0, 0, 0, 0],
    theta=['f', 'g', 'h', 'i', 'j'],
    line=dict(
        # shape = "spline",
        color="rgba(180,120,150,0)",
        smoothing=1.3
    ),
    hoverlabel=dict(
        namelength=20
    )
)
)
layout = go.Layout(
showlegend = False,
polar=dict(
bgcolor=bgcolor,
sector=[119,420],
angularaxis=dict(
showline=False, # the grid's outline
# visible=True,
showticklabels=False,
tickfont = dict(
size = 8
),
ticks = "outside",
nticks = 30,
direction = "clockwise",
rotation = 60,
# gridcolor = "rgb(133,134,136,0.4)",
gridcolor="white",
gridwidth = 1
),
radialaxis=dict(
angle = 60,
showline=False, # the tick line
showticklabels=False,
visible=True,
nticks=4,
range = [range_min,range_max],
# gridcolor = "rgb(133,134,136)",
gridwidth = 2,
gridcolor = "white", #the tick line
)
),
paper_bgcolor=bgcolor,
)
fig = go.Figure(data=data, layout=layout)
fig['layout'].update(width=800,height=800)
iplot(fig)
Legend of the markers and lines
data=[]
# Build the traces for the legend chart from rows 1 and 2 of df_norm
# (row 0 is skipped and the loop stops once i exceeds 2).
for i in range(len(df_norm)):
    if i < 1:
        continue
    if i > 2:
        break
    for index, k in enumerate(directions.keys()):
        # if index>=3:
        #     break
        r = df_norm[directions[k]].iloc[i].values
        theta = directions[k].copy()
        r = list(r)
        # Each group is bracketed by two boundary categories taken from
        # extra_theta; consecutive groups share one boundary.
        prevkey = extra_theta[(index) % len(extra_theta)]
        nextkey = extra_theta[(index + 1) % len(extra_theta)]
        r = [range_min + offset] + r + [range_min + offset]
        theta = [prevkey] + theta + [nextkey]
        data.append(go.Scatterpolar(
            name="Country " + str(i),  # Nation as the trace name
            # Flags in `mode` are joined with "+", not ",".
            mode="markers+lines",
            marker=dict(
                color=colors[index],
                size=14
            ),
            r=r,
            theta=theta,
            line=dict(
                shape="spline",
                # line.color takes a single color string, not a list.
                color=colors[index],
                smoothing=1.3,
                width=8
            ),
            hoverlabel=dict(
                namelength=20
            )
        ))
# Add a blank angle: a trace over the otherwise unused categories 'f'..'j'.
# NOTE(review): presumably this only reserves an empty wedge on the angular
# axis, so the line is kept fully transparent like the marker - confirm.
data.append(go.Scatterpolar(
    name=df_norm['indicator'].iloc[i],  # Nation as the trace name
    mode="lines",
    marker=dict(
        color="rgba(180,120,150,0)",
    ),
    r=[0, 0, 0, 0, 0],
    theta=['f', 'g', 'h', 'i', 'j'],
    line=dict(
        # shape = "spline",
        color="rgba(180,120,150,0)",
        smoothing=1.3
    ),
    hoverlabel=dict(
        namelength=20
    )
)
)
layout = go.Layout(
showlegend = False,
polar=dict(
bgcolor=bgcolor,
sector=[0,59],
angularaxis=dict(
showline=False, # the grid's outline
# visible=True,
showticklabels=False,
tickfont = dict(
size = 8
),
ticks = "outside",
nticks = 30,
direction = "clockwise",
rotation = 60,
gridcolor = "rgba(133,134,136,0.7)",
gridwidth = 4
),
radialaxis=dict(
# angle = 60,
showline=True, # the tick line
showticklabels=True,
visible=True,
nticks=6,
range = [range_min,range_max],
gridcolor = "rgba(255,255,255,0.4)",
gridwidth = 8,
# linecolor = "white", #the tick line
tickfont=dict(
size=28,
family="doris serif")
)
),
paper_bgcolor=bgcolor,
)
fig = go.Figure(data=data, layout=layout)
fig['layout'].update(width=800,height=800)
iplot(fig)
df_sorted = df_norm.sort_values(by=['continent', 'indicator'])
## Access row using .iloc[]
# df_sorted.iloc[0]
from plotly import tools
from random import randint
total= df_norm['continent'].nunique()
traces = []
subplot_titles = []
#format
theta = list(df_norm.columns[1:-1])
# theta.append(df_norm.columns[1])
# print(theta)
# colors=['D94F70','45B5AA','F0C05A','5A5B9F','9B1B30']
# Plot one subplot
def draw_one_nation(data, subplot):
    """Build the radar traces for a single country.

    `data` is indexed with each column list of the module-level `directions`
    mapping (the visible caller passes one row of `df_norm`), and `subplot`
    is the polar subplot id assigned to every trace.

    Returns a list with one go.Scatterpolar line trace per entry of
    `directions`, in the mapping's iteration order.
    """
    baseline = range_min + offset
    n_keys = len(extra_theta)
    nation_traces = []
    for position, group in enumerate(directions):
        columns = directions[group]
        # Bracket the group's values with a baseline point placed on the
        # boundary categories it shares with the neighbouring groups.
        radii = [baseline] + list(data[columns]) + [baseline]
        angles = (
            [extra_theta[position % n_keys]]
            + columns.copy()
            + [extra_theta[(position + 1) % n_keys]]
        )
        nation_traces.append(
            go.Scatterpolar(
                subplot=subplot,
                r=radii,
                theta=angles,
                mode="lines",
                line=dict(
                    shape="spline",
                    color=colors[position],
                ),
            )
        )
    return nation_traces
# Plot for group of continents
continents = df_norm.groupby('continent')
data = []
# ga_index = continents.groups['America']
# ga = continents.get_group('America')
# One polar subplot per continent: "polar", "polar2", "polar3", ...
for position, (continent, nations) in enumerate(continents):
    subplot = "polar"
    if position != 0:
        subplot += str(position + 1)
    # Overlay the radar traces of every nation of this continent.
    for _, nation in nations.iterrows():
        data += draw_one_nation(nation, subplot)
    # Add a blank angle: one trace per subplot over the otherwise unused
    # categories 'f'..'j'.
    # NOTE(review): presumably this only reserves an empty wedge, so the
    # line is kept fully transparent like the marker - confirm. The color no
    # longer depends on an `index` variable left over from an earlier loop.
    data.append(go.Scatterpolar(
        # name = df_norm['indicator'].iloc[i], #Nation as the trace name
        subplot=subplot,
        mode="lines",
        marker=dict(
            color="rgba(180,120,150,0)",
        ),
        r=[0, 0, 0, 0, 0],
        theta=['f', 'g', 'h', 'i', 'j'],
        line=dict(
            # shape = "spline",
            # line.color takes a single color string, not a list.
            color="rgba(180,120,150,0)",
            smoothing=1.3
        ),
        hoverlabel=dict(
            namelength=20
        )
    )
    )
# Subplot format: lay the continent subplots out on a grid `cols` wide.
# The subplot count comes from the grouping itself rather than a fixed 5.
total = continents.ngroups
cols = 2
# Ceiling division; `total // cols + 1` would add an empty row whenever
# total is a multiple of cols. At least one row avoids dividing by zero.
rows = max(1, -(-total // cols))
# Each cell uses 95% of its share of the figure in both directions.
width = 1/cols*0.95
height = 1/rows*0.95
# Extra spacing between cells (currently none).
w_offset = 1/cols*0
h_offset = 1/rows*0
polars = dict()
for i, continent in enumerate(continents):
subplot = "polar"
if i!=0:
subplot += str(i+1)
col = i % cols
row = i // cols
x = [(width+w_offset)*(col), width*(col+1) + w_offset*col]
y = [1-height*(row+1)-h_offset*row, 1-(height+h_offset)*(row)]
print(continent[0], col, row)
polars[subplot] = dict(
domain = dict(
x = x,
y = y
),
# angularaxis=dict(
# tickfont = dict(
# size = 10
# ),
# ticks = "outside",
# rotation = 90,
# direction = "clockwise"
# ),
# radialaxis = dict(
# visible = True,
# tickfont = dict(
# size = 6
# ),
# nticks = 6,
# range = [-3, 3]
# )
bgcolor=bgcolor,
sector=[119,420],
angularaxis=dict(
showline=False, # the grid's outline
visible=False,
showticklabels=False,
tickfont = dict(
size = 8
),
ticks = "outside",
# nticks = 30,
nticks=6,
direction = "clockwise",
rotation = 60,
# gridcolor = "rgb(133,134,136,0.4)",
gridcolor="rgba(255,255,255,0.5)",
gridwidth = 1
),
radialaxis=dict(
angle = 60,
showline=False, # the tick line
showticklabels=False,
visible=True,
nticks=4,
range = [range_min,range_max],
# gridcolor = "rgb(133,134,136)",
gridwidth = 4,
gridcolor = "white", #the tick line
)
)
layout = go.Layout(
showlegend = False,
# title = "Small Nation in different Continents ",
paper_bgcolor=bgcolor,
)
fig = go.Figure(data=data, layout=layout)
fig['layout'].update(polars)
fig['layout'].update(width=max(cols*600,800),height=rows*500)
iplot(fig)
Africa 0 0 America 1 0 Asia 0 1 Europe 1 1 Oceania 0 2