#!/usr/bin/env python
# coding: utf-8

# # Gapminder Data Analysis
#
# The dataset used for this notebook was obtained from
# [gapminder.org](http://www.gapminder.org/data/) with the following information:
#
# * Aged 15+ Employment Rate (%)
# * Life Expectancy (years)
# * GDP/capita (US$, inflation adjusted)
# * Primary school completion (% of boys)
# * Primary school completion (% of girls)
#
# ###### Summary
# * 1) Import and clean data
#     * import data
#     * worldwide data
#     * canadian data
#
# * 2) Explore Data
#     * summary statistics canada vs worldwide
#         * employment rate
#         * life expectancy
#         * GDP
#         * male school completion
#         * female school completion
#     * correlations canada vs worldwide
#         * life expectancy and GDP
#         * gender and school completion
#     * compare canada and worldwide data
#         * standardizing the data point
#         * time series graphs
#
# * 3) Conclusions
#
# # 1) Import and Clean Data
#
# ###### Import Data

# In[1]:

# Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# NOTE(review): `get_ipython` only exists when this export is run under
# IPython/Jupyter; a plain `python` run raises NameError here.
get_ipython().run_line_magic('pylab', 'inline')

# Read worldwide data into Pandas DataFrames with index column as country
employment = pd.read_csv('employment_above_15.csv', index_col='Country')
female_completion = pd.read_csv('female_completion_rate.csv', index_col='Country')
male_completion = pd.read_csv('male_completion_rate.csv', index_col='Country')
life_expectancy = pd.read_csv('life_expectancy.csv', index_col='Country')
gdp = pd.read_csv('gdp_per_capita.csv', index_col='Country')

# Data specific to index Canada:
# Use .loc to look up values in pd series by index and .iloc for position
employment_ca = employment.loc['Canada'].round(2)
# Drop NaN from the male/female data series
female_completion_ca = female_completion.loc['Canada'].dropna().round(2)
male_completion_ca = male_completion.loc['Canada'].dropna().round(2)
life_expectancy_ca = life_expectancy.loc['Canada'].round(2)
gdp_ca = gdp.loc['Canada'].round(2)

# #### Worldwide Data
#
# In[2]:

# print(...) is always called with ONE pre-formatted argument below, which
# behaves the same under Python 2 (the notebook's kernel) and Python 3.

# Print all countries available in worldwide data
print(employment.index.values)
print("Number of countries in dataset:  {}".format(len(employment.index.values)))

# From this data analysis, we will be using data from 178 countries to
# represent "worldwide" trends, which should be sufficient.

# #### Canadian Data
#
# When we were cleaning the data, we removed all the NaN values from male and
# female school completion rates which left us with only 4 data points for
# Canada. All floating point data types.

# In[3]:

print(employment_ca.index.values)
print("Employment data points for Canada: {}".format(len(employment_ca.index.values)))
employment_ca.dtype


# In[4]:

print(female_completion_ca.values)
female_completion_ca.dtype


# In[5]:

print(male_completion_ca.values)
male_completion_ca.dtype


# In[6]:

print(life_expectancy_ca.values)
print("Life expectancy data points for Canada: {}".format(len(life_expectancy_ca.index.values)))
life_expectancy_ca.dtype


# In[7]:

print(gdp_ca.values)
print("GDP data points for Canada: {}".format(len(gdp_ca.index.values)))
gdp_ca.dtype


# # Explore Data
#
# #### Summary Statistics Canada vs Worldwide

# In[18]:

def sum_stats(index, label=None):
    """Print the mean, standard deviation, max and sum of ``index``.

    Args:
        index: object exposing ``mean()``, ``std()``, ``max()`` and ``sum()``
            (the calls below pass pandas Series).
        label: optional heading printed before the statistics.

    Returns:
        None. The statistics are only printed, so callers must not print the
        return value (that is what used to emit a trailing "<label> None"
        after the numbers).
    """
    if label is not None:
        print(label)
    print("Mean: {}".format(index.mean()))
    print("Standard deviation: {}".format(index.std()))
    print("Max: {}".format(index.max()))
    print("Sum: {}".format(index.sum()))


# Canada
sum_stats(employment_ca, label="Employment CA:")
sum_stats(female_completion_ca, label="Female school completion rates CA:")
sum_stats(male_completion_ca, label="Male school completion rates CA:")
sum_stats(life_expectancy_ca, label="Life expectancy CA:")
sum_stats(gdp_ca, label="GDP CA:")

# Worldwide (left disabled, as in the original notebook)
# sum_stats(employment, label="Employment worldwide:")
# sum_stats(female_completion, label="Female school completion rates worldwide:")
# sum_stats(male_completion, label="Male school completion rates worldwide:")
# sum_stats(life_expectancy, label="Life expectancy worldwide:")
# sum_stats(gdp, label="GDP worldwide:")


# #### Worldwide: The countries with the highest
# and lowest employment rates

# In[23]:

# Instead of using data from all 178 countries, we will sample the first 20 countries
countries = [
    'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda',
    'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan',
    'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
    'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia',
]

employment_values = [
    55.70000076, 51.40000153, 50.5, 75.69999695,
    58.40000153, 40.09999847, 61.5, 57.09999847,
    60.90000153, 66.59999847, 60.40000153, 68.09999847,
    66.90000153, 53.40000153, 48.59999847, 56.79999924,
    71.59999847, 58.40000153, 70.40000153, 41.20000076,
]

life_expectancy_values = [
    74.7, 75., 83.4, 57.6, 74.6, 75.4, 72.3, 81.5, 80.2, 70.3,
    72.1, 76.4, 68.1, 75.2, 69.8, 79.4, 70.8, 62.7, 67.3, 70.6,
]

gdp_values = [
    1681.61390973, 2155.48523109, 21495.80508273, 562.98768478,
    13495.1274663, 9388.68852258, 1424.19056199, 24765.54890176,
    27036.48733192, 1945.63754911, 21721.61840978, 13373.21993972,
    483.97086804, 9783.98417323, 2253.46411147, 25034.66692293,
    3680.91642923, 366.04496652, 1175.92638695, 1132.21387981,
]

female_completion = np.array([
    97.35583, 104.62379, 103.02998, 95.14321, 103.69019,
    98.49185, 100.88828, 95.43974, 92.11484, 91.54804,
    95.98029, 98.22902, 96.12179, 119.28105, 97.84627,
    29.07386, 38.41644, 90.70509, 51.7478, 95.45072,
])

male_completion = np.array([
    95.47622, 100.66476, 99.7926, 91.48936, 103.22096,
    97.80458, 103.81398, 88.11736, 93.55611, 87.76347,
    102.45714, 98.73953, 92.22388, 115.3892, 98.70502,
    37.00692, 45.39401, 91.22084, 62.42028, 90.66958,
])

# From here on these names are country-indexed Series of the 20-country
# sample; any earlier binding of the same names is replaced.
employment = pd.Series(employment_values, index=countries)
life_expectancy = pd.Series(life_expectancy_values, index=countries)
# NOTE: despite the "_completion" suffix this Series holds the GDP values.
gdp_completion = pd.Series(gdp_values, index=countries)
female_completion = pd.Series(female_completion, index=countries)
male_completion = pd.Series(male_completion, index=countries)


# In[33]:

def max_variable(variable):
    """Return ``(label, value)`` for the largest entry of a pandas Series.

    ``idxmax`` is used instead of ``argmax``: ``idxmax`` returns the index
    *label*, whereas ``argmax`` returns an integer *position* on current
    pandas, which cannot be fed to ``.loc`` on a label-indexed Series.

    Raises:
        ValueError: if ``variable`` is empty (raised by ``idxmax``).
    """
    max_country = variable.idxmax()
    max_value = variable.loc[max_country]
    return (max_country, max_value)


def min_variable(variable):
    """Return ``(label, value)`` for the smallest entry of a pandas Series.

    Mirrors ``max_variable`` and uses ``idxmin`` for the same reason.

    Raises:
        ValueError: if ``variable`` is empty (raised by ``idxmin``).
    """
    min_country = variable.idxmin()
    min_value = variable.loc[min_country]
    return (min_country, min_value)


# print(...) gets ONE pre-formatted argument so the output is the same under
# Python 2 and Python 3.
print("{} {}".format(max_variable(employment), min_variable(employment)))
print("{} {}".format(max_variable(life_expectancy), min_variable(life_expectancy)))
print("{} {}".format(max_variable(gdp_completion), min_variable(gdp_completion)))
# (female/male completion extremes are printed in the next cell)


# ### Gender and School Completion
# The overall school completion rate in each country and how it relates to gender.

# In[32]:

print("{} {}".format(max_variable(female_completion), min_variable(female_completion)))
print("{} {}".format(max_variable(male_completion), min_variable(male_completion)))


# Calculate the overall completion rate assuming 50/50 gender ratio
def overall_completion_rate(female_completion, male_completion):
    """Return the element-wise average of the female and male completion rates."""
    return (female_completion + male_completion) / 2


overall_completion_rate(female_completion, male_completion).round(2)


# ### Standardizing Data
#
# We can standardize data for each country by comparing the single data point
# to the rest of the data points (first 20 countries). Specifically, we look at
# how a variable in Canada compares to other countries. We can convert each
# data point to the number of standard deviations from the mean. A positive
# value represents that a data point is above the mean, and a negative value
# represents that data point below the mean.
# In[39]:

def standardize_data(values):
    """Convert each value to its number of standard deviations from the mean.

    Args:
        values: array-like exposing ``mean()`` and ``std()`` and supporting
            element-wise arithmetic (pandas Series or NumPy array).

    Returns:
        An object of the same kind as ``values`` (a Series in gives a Series
        out). Positive entries lie above the mean, negative ones below.

    Note:
        ``std()`` is called with its default arguments. pandas' ``Series.std``
        defaults to the sample standard deviation (ddof=1) while NumPy's
        ``ndarray.std`` defaults to the population one (ddof=0), so the two
        input types give slightly different scales.
    """
    standardized_values = (values - values.mean()) / values.std()
    return standardized_values


# In[40]:

print(standardize_data(employment))


# In[41]:

print(standardize_data(female_completion))


# In[42]:

print(standardize_data(male_completion))


# In[43]:

print(standardize_data(life_expectancy))


# In[51]:

# print(standardize_data(gdp))


# ### Plots of Variables Over Time
#
# Pandas Series were used to create a plot of each variable over time for
# Canada. The variables include employment rates, female school completion
# rate, male school completion rate, life expectancy, and GDP.

# In[59]:

employment_ca.plot()
plt.ylabel('Employment rate')


# In[60]:

female_completion_ca.plot()
plt.ylabel('Female school completion rate')


# In[61]:

male_completion_ca.plot()
plt.ylabel('Male school completion rate')


# In[56]:

life_expectancy_ca.plot()
plt.ylabel('Life expectancy (years)')


# In[55]:

gdp_ca.plot()
# The series plotted is Canada's GDP per capita, expressed in US dollars.
plt.ylabel('GDP per capita (US$, inflation adjusted)')


# ### GDP and Life Expectancy

# In[ ]:

# Import Data
countries = [
    'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda',
    'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan',
    'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
    'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia',
]

life_expectancy_values = [
    74.7, 75., 83.4, 57.6, 74.6, 75.4, 72.3, 81.5, 80.2, 70.3,
    72.1, 76.4, 68.1, 75.2, 69.8, 79.4, 70.8, 62.7, 67.3, 70.6,
]

gdp_values = [
    1681.61390973, 2155.48523109, 21495.80508273, 562.98768478,
    13495.1274663, 9388.68852258, 1424.19056199, 24765.54890176,
    27036.48733192, 1945.63754911, 21721.61840978, 13373.21993972,
    483.97086804, 9783.98417323, 2253.46411147, 25034.66692293,
    3680.91642923, 366.04496652, 1175.92638695, 1132.21387981,
]

# These Series use the default integer index (no country labels); the two
# lists are in the same country order, so positions line up pairwise.
life_expectancy = pd.Series(life_expectancy_values)
gdp = pd.Series(gdp_values)


# In[ ]:

# print(...) gets ONE pre-formatted argument so the output is the same under
# Python 2 and Python 3.

# Life expectancy summary statistics
print("Mean life expectancy: {}".format(life_expectancy.mean()))
print("Standard deviation of life expectancy: {}".format(round(life_expectancy.std(), 2)))
print("Max life expectancy: {}".format(round(life_expectancy.max(), 2)))
print("Sum life expectancy: {}".format(round(life_expectancy.sum(), 2)))
print("")

# GDP summary statistics
print("Mean GDP: {}".format(round(gdp.mean(), 2)))
print("Standard deviation of GDP: {}".format(round(gdp.std(), 2)))
print("Max GDP: {}".format(round(gdp.max(), 2)))
print("Sum GDP: {}".format(round(gdp.sum(), 2)))


# Looking for the correlation between two pandas Series. The function below
# returns two counts:
# * first count : The number of countries where both values are above or both are below the mean
# * second count : The number of countries where one value is above and one is below the mean
#

# In[ ]:

def variable_correlation(variable1, variable2):
    """Count how often two variables fall on the same side of their means.

    "Direction" means whether a value is above or below the mean of its own
    variable.

    Args:
        variable1: pandas Series (or NumPy array) of numeric values.
        variable2: values paired element-wise with ``variable1``.

    Returns:
        Tuple ``(num_same_direction, num_different_direction)``. The second
        count is ``len(variable1)`` minus the first, so a pair in which either
        value is exactly equal to its mean (neither above nor below) is
        counted as "different".
    """
    # Boolean masks: True where the pair moves in the same direction
    both_above = (variable1 > variable1.mean()) & (variable2 > variable2.mean())
    both_below = (variable1 < variable1.mean()) & (variable2 < variable2.mean())
    is_same_direction = both_above | both_below

    # Summing booleans counts the True entries
    num_same_direction = is_same_direction.sum()
    num_different_direction = len(variable1) - num_same_direction

    return (num_same_direction, num_different_direction)


# Correlation between life expectancy and GDP
variable_correlation(life_expectancy, gdp)


# #### Positive correlation between GDP and Life Expectancy
# For our sample set, 17 pairs of data points are going in the same direction,
# and 3 pairs of data are going in the opposite direction. Since the first
# number is large and the second number is small, there is a positive
# correlation. If the first number were small and the second number large,
# there would be a negative correlation. If the two numbers were equal, then
# there would be no correlation.