#!/usr/bin/env python
# coding: utf-8

# # Gapminder Data Analysis
# 
# The dataset used for this notebook was obtained from [gapminder.org](http://www.gapminder.org/data/) with the following information:
# 
# * Aged 15+ Employment Rate (%)
# * Life Expectancy (years)
# * GDP/capita (US$, inflation adjusted)
# * Primary school completion (% of boys)
# * Primary school completion (% of girls)
# 
# ###### Summary 
# * 1) Import and clean data
#     * import data 
#     * worldwide data
#     * canadian data
#    
# * 2) Explore Data
#     * summary statistics canada vs worldwide
#         * employment rate 
#         * life expectancy 
#         * GDP
#         * male school completion
#         * female school completion 
#     * correlations canada vs worldwide
#         * life expectancy and GDP 
#         * gender and school completion 
#     * compare canada and worldwide data 
#         * standardizing the data point
#         * time series graphs
#   
# * 3) Conclusions
# 
# # 1) Import and Clean Data
# 
# ###### Import Data

# In[1]:


# Libraries 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
get_ipython().run_line_magic('pylab', 'inline')

# Each indicator is loaded from its own CSV with the 'Country' column as the
# row index; the Canadian row is then pulled out by label with .loc
# (.iloc would select by position instead) and rounded to 2 decimals.

# Employment rate
employment = pd.read_csv('employment_above_15.csv', index_col='Country')
employment_ca = employment.loc['Canada'].round(2)

# Female school completion rate (missing values dropped from the Canadian series)
female_completion = pd.read_csv('female_completion_rate.csv', index_col='Country')
female_completion_ca = female_completion.loc['Canada'].dropna().round(2)

# Male school completion rate (missing values dropped from the Canadian series)
male_completion = pd.read_csv('male_completion_rate.csv', index_col='Country')
male_completion_ca = male_completion.loc['Canada'].dropna().round(2)

# Life expectancy
life_expectancy = pd.read_csv('life_expectancy.csv', index_col='Country')
life_expectancy_ca = life_expectancy.loc['Canada'].round(2)

# GDP per capita
gdp = pd.read_csv('gdp_per_capita.csv', index_col='Country')
gdp_ca = gdp.loc['Canada'].round(2)


# #### Worldwide Data

# In[2]:


# Print all countries avaliable in worldwide data 
print employment.index.values
print "Number of countries in dataset: ",len(employment.index.values)


# For this analysis, we will use data from 178 countries to represent "worldwide" trends, which should be sufficient.

# #### Canadian Data
# 
# When cleaning the data, we removed all the NaN values from the male and female school completion rates, which left us with only 4 data points for Canada. All of the Canadian series have a floating-point data type.

# In[3]:


print employment_ca.index.values
print "Employment data points for Canada:", len(employment_ca.index.values)
employment_ca.dtype


# In[4]:


print female_completion_ca.values
female_completion_ca.dtype


# In[5]:


print male_completion_ca.values
male_completion_ca.dtype


# In[6]:


print life_expectancy_ca.values
print "Life expectancy data points for Canada:", len(life_expectancy_ca.index.values)
life_expectancy_ca.dtype


# In[7]:


print gdp_ca.values
print "GDP data points for Canada:", len(gdp_ca.index.values)
gdp_ca.dtype


# # Explore Data 
# 
# #### Summary Statistics Canada vs Worldwide

# In[18]:


def sum_stats(index):
    sum_mean = index.mean()
    sum_std = index.std()
    sum_max = index.max()
    sum_sum = index.sum()
    print "Mean:", sum_mean
    print "Standard deviation:", sum_std
    print "Max:", sum_max
    print "Sum:", sum_sum

# Canada 
print "Employment CA:", sum_stats(employment_ca)
print "Female school completion rates CA:", sum_stats(female_completion_ca)
print "Male school completion rates CA:", sum_stats(male_completion_ca)
print "Life expectancy CA:", sum_stats(life_expectancy_ca)
print "GDP CA:", sum_stats(gdp_ca)

# Worldwide 
''' 
print "Employment worldwide:", sum_stats(employment)
print "Female school completion rates worldwide:", sum_stats(female_completion)
print "Male school completion rates worldwide:", sum_stats(male_completion)
print "Life expectancy worldwide:", sum_stats(life_expectancy)
print "GDP worldwide:", sum_stats(gdp)
''' 


# #### Worldwide: The countries with the highest and lowest employment rates

# In[23]:


# Instead of using data from all 178 countries, work with a hard-coded sample
# of 20 countries. From here on the names employment, life_expectancy, gdp,
# female_completion and male_completion refer to country-indexed Series built
# from this sample rather than to the tables loaded from CSV.

countries = ['Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda',
             'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan',
             'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
             'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia']

# Raw values, one entry per country, in the same order as `countries`
employment_values = [
    55.70000076, 51.40000153, 50.5, 75.69999695,
    58.40000153, 40.09999847, 61.5, 57.09999847,
    60.90000153, 66.59999847, 60.40000153, 68.09999847,
    66.90000153, 53.40000153, 48.59999847, 56.79999924,
    71.59999847, 58.40000153, 70.40000153, 41.20000076,
]

life_expectancy_values = [
    74.7, 75.0, 83.4, 57.6, 74.6,
    75.4, 72.3, 81.5, 80.2, 70.3,
    72.1, 76.4, 68.1, 75.2, 69.8,
    79.4, 70.8, 62.7, 67.3, 70.6,
]

gdp_values = [
    1681.61390973, 2155.48523109, 21495.80508273, 562.98768478,
    13495.1274663, 9388.68852258, 1424.19056199, 24765.54890176,
    27036.48733192, 1945.63754911, 21721.61840978, 13373.21993972,
    483.97086804, 9783.98417323, 2253.46411147, 25034.66692293,
    3680.91642923, 366.04496652, 1175.92638695, 1132.21387981,
]

female_completion_values = [
    97.35583, 104.62379, 103.02998, 95.14321, 103.69019,
    98.49185, 100.88828, 95.43974, 92.11484, 91.54804,
    95.98029, 98.22902, 96.12179, 119.28105, 97.84627,
    29.07386, 38.41644, 90.70509, 51.7478, 95.45072,
]

male_completion_values = [
    95.47622, 100.66476, 99.7926, 91.48936, 103.22096,
    97.80458, 103.81398, 88.11736, 93.55611, 87.76347,
    102.45714, 98.73953, 92.22388, 115.3892, 98.70502,
    37.00692, 45.39401, 91.22084, 62.42028, 90.66958,
]

# Country-indexed Series, so values can be looked up by country name
employment = pd.Series(employment_values, index=countries)
life_expectancy = pd.Series(life_expectancy_values, index=countries)
gdp = pd.Series(gdp_values, index=countries)
# The GDP sample was originally bound only to the misleading name
# `gdp_completion`; keep that name as an alias for existing references.
gdp_completion = gdp
female_completion = pd.Series(female_completion_values, index=countries)
male_completion = pd.Series(male_completion_values, index=countries)


# In[33]:


def max_variable(variable):
    '''
    Return (label, value) for the largest entry of a pandas Series.

    `variable` is a Series (here indexed by country name). idxmax() is used
    because it returns the index *label* of the maximum; argmax() returns
    an integer position in current pandas, which .loc cannot resolve on a
    label-indexed Series.
    '''
    max_country = variable.idxmax()           # index label of the maximum value
    max_value = variable.loc[max_country]     # value stored under that label
    return (max_country, max_value)

def min_variable(variable):
    '''
    Return (label, value) for the smallest entry of a pandas Series.

    `variable` is a Series (here indexed by country name). idxmin() is used
    because it returns the index *label* of the minimum; argmin() returns
    an integer position in current pandas, which .loc cannot resolve on a
    label-indexed Series.
    '''
    min_country = variable.idxmin()           # index label of the minimum value
    min_value = variable.loc[min_country]     # value stored under that label
    return (min_country, min_value)

print max_variable(employment), min_variable(employment)
print max_variable(life_expectancy), min_variable(life_expectancy)
print max_variable(gdp_completion), min_variable(gdp_completion)
# print max_variable(female_completion), min_variable(female_completion)
# print max_variable(male_completion), min_variable(male_completion)


# ### Gender and School Completion 
# The overall school completion rate in each country and how it relates to gender.  

# In[32]:


print max_variable(female_completion), min_variable(female_completion)
print max_variable(male_completion), min_variable(male_completion)

# Overall completion rate under the assumption of a 50/50 gender ratio
def overall_completion_rate(female_completion, male_completion):
    '''
    Return the plain average of the female and male completion rates.

    The two arguments are added and the sum is divided by 2, so both
    genders carry equal weight. Works for anything supporting `+` and `/`
    (scalars, or pandas Series, which are added element-wise by label).
    '''
    combined = female_completion + male_completion
    return combined / 2

# Overall completion rate per sampled country, rounded to 2 decimals
# (bare expression: shown as the cell result in a notebook)
overall_completion_rate(
    female_completion=female_completion,
    male_completion=male_completion,
).round(decimals=2)


# ### Standardizing Data 
# 
# We can standardize the data for each country by comparing its single data point to the rest of the data points (the 20 sampled countries). Specifically, we look at how a variable in one country compares to the same variable in the other countries. Each data point is converted to the number of standard deviations it lies from the mean. A positive value means the data point is above the mean, and a negative value means the data point is below the mean.

# In[39]:


# Convert values to z-scores: distance from the mean in standard deviations
def standardize_data(values):
    '''
    Return `values` re-expressed as standard deviations from their mean.

    The result is (values - mean) / std, computed with the input's own
    .mean() and .std() methods, so it has the same container type as the
    input (a pandas Series in gives a pandas Series out).
    NOTE: pandas .std() defaults to the sample standard deviation (ddof=1)
    while numpy's ndarray.std() defaults to the population one (ddof=0),
    so the scaling depends on which type is passed in.
    '''
    center = values.mean()
    spread = values.std()
    return (values - center) / spread


# In[40]:


print standardize_data(employment)


# In[41]:


print standardize_data(female_completion)


# In[42]:


print standardize_data(male_completion)


# In[43]:


print standardize_data(life_expectancy)


# In[51]:


# print standardize_data(gdp)


# ### Plots of Variables Over Time
# 
# Pandas Series were used to create a plot of each variable over time for Canada. The variables include the employment rate, female school completion rate, male school completion rate, life expectancy, and GDP.

# In[59]:


# Each cell plots one Canadian series against its index and labels the y-axis.

employment_ca.plot()
plt.ylabel('Employment rate')


# In[60]:


female_completion_ca.plot()
plt.ylabel('Female school completion rate')


# In[61]:


male_completion_ca.plot()
plt.ylabel('Male school completion rate')


# In[56]:


life_expectancy_ca.plot()
# Life expectancy is measured in years, not as a rate
plt.ylabel('Life expectancy (years)')


# In[55]:


gdp_ca.plot()
# This is Canada's GDP per capita expressed in US dollars, not US GDP
plt.ylabel('GDP per capita (US$, inflation adjusted)')


# ### GDP and Life Expectancy

# In[ ]:


# Hard-coded sample data for this section: the 20 countries and their
# life expectancy / GDP values, listed in matching order.
countries = [
    'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda',
    'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan',
    'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
    'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia',
]

life_expectancy_values = [
    74.7, 75.0, 83.4, 57.6, 74.6,
    75.4, 72.3, 81.5, 80.2, 70.3,
    72.1, 76.4, 68.1, 75.2, 69.8,
    79.4, 70.8, 62.7, 67.3, 70.6,
]

gdp_values = [
    1681.61390973, 2155.48523109, 21495.80508273, 562.98768478,
    13495.1274663, 9388.68852258, 1424.19056199, 24765.54890176,
    27036.48733192, 1945.63754911, 21721.61840978, 13373.21993972,
    483.97086804, 9783.98417323, 2253.46411147, 25034.66692293,
    3680.91642923, 366.04496652, 1175.92638695, 1132.21387981,
]

# Both Series use the default integer index (positions 0..19), not country names
life_expectancy = pd.Series(life_expectancy_values)
gdp = pd.Series(gdp_values)


# In[ ]:


# Life expectancy summary statistics 
print "Mean life expectancy:", life_expectancy.mean()
print "Standard deviation of the mean life expectancy:", round(life_expectancy.std(),2)
print "Max life expectancy:", round(life_expectancy.max(),2)
print "Sum life expectancy:", round(life_expectancy.sum(),2)
print

# GDP summary statistics 
print "Mean GDP:", round(gdp.mean(),2)
print "Standard deviation of the mean GDP:", round(gdp.std(),2)
print "Max GDP:", round(gdp.max(),2)
print "Sum GDP:", round(gdp.sum(),2)


# Looking for the correlation between two pandas Series by counting:
# * num_same_direction : The number of countries where both values are above or both are below their means 
# * num_different_direction : The number of countries where one value is above and one is below its mean (all remaining countries) 
# 

# In[ ]:


def variable_correlation(variable1, variable2):
    '''
    Count how often two variables fall on the same side of their own means.

    Returns a tuple (num_same_direction, num_different_direction).
    The first entry counts the positions where both values are strictly
    above, or both strictly below, their respective means. The second is
    len(variable1) minus the first, so a value exactly equal to its mean
    ends up in the "different" count.
    '''
    mean1 = variable1.mean()
    mean2 = variable2.mean()

    # Boolean masks: which side of its own mean each value lies on
    above1, above2 = variable1 > mean1, variable2 > mean2
    below1, below2 = variable1 < mean1, variable2 < mean2

    # Summing a boolean mask counts its True entries
    num_same_direction = ((above1 & above2) | (below1 & below2)).sum()

    return (num_same_direction, len(variable1) - num_same_direction)

# Same-direction vs. different-direction counts for life expectancy and GDP
# (bare expression: shown as the cell result in a notebook)
variable_correlation(variable1=life_expectancy, variable2=gdp)


# #### Positive correlation between GDP and Life Expectancy 
# For our sample set, 17 pairs of data points are going in the same direction, and 3 pairs of data points are going in opposite directions. Since the first number is large and the second number is small, there is a positive correlation. If the first number were small and the second number large, there would be a negative correlation. If the two numbers were roughly equal, there would be no correlation.