#!/usr/bin/env python # coding: utf-8 # Internet use and religion in Europe, part two # ----------------------------------------- # # This notebook presents a second-pass analysis of the association between Internet use and religion in Europe, using data from the European Social Survey (http://www.europeansocialsurvey.org). # # Copyright 2015 Allen Downey # # MIT License: http://opensource.org/licenses/MIT # In[1]: from __future__ import print_function, division import numpy as np import pandas as pd import thinkstats2 import thinkplot import statsmodels.formula.api as smf from iso_country_codes import COUNTRY get_ipython().run_line_magic('matplotlib', 'inline') # The following function selects the columns I need. # In[2]: def read_cycle(filename): """Reads a file containing ESS data and selects columns. filename: string returns: DataFrame """ df = pd.read_stata(filename, convert_categoricals=False) if 'hinctnta' not in df.columns: df['hinctnta'] = df.hinctnt if 'inwyr' not in df.columns: df['inwyr'] = df.inwyye cols = ['cntry', 'inwyr', 'tvtot', 'tvpol', 'rdtot', 'rdpol', 'nwsptot', 'nwsppol', 'netuse', 'rlgblg', 'rlgdgr', 'eduyrs', 'hinctnta', 'yrbrn', 'eisced', 'pspwght', 'pweight'] df = df[cols] return df # Read data from Cycle 1. # In[3]: df1 = read_cycle('ESS1e06_4.dta') df1.head() # Read data from Cycle 2. # In[4]: df2 = read_cycle('ESS2e03_4.dta') df2.head() # Read data from Cycle 3. # In[5]: df3 = read_cycle('ESS3e03_5.dta') df3.head() # Read data from Cycle 4. # In[6]: df4 = read_cycle('ESS4e04_3.dta') df4.head() # Read data from Cycle 5. # In[7]: df5 = read_cycle('ESS5e03_2.dta') df5.head() # In[8]: def clean_cycle(df): """Cleans data from one cycle. 
df: DataFrame """ df.tvtot.replace([77, 88, 99], np.nan, inplace=True) df.rdtot.replace([77, 88, 99], np.nan, inplace=True) df.nwsptot.replace([77, 88, 99], np.nan, inplace=True) df.netuse.replace([77, 88, 99], np.nan, inplace=True) df.tvpol.replace([66, 77, 88, 99], np.nan, inplace=True) df.rdpol.replace([66, 77, 88, 99], np.nan, inplace=True) df.nwsppol.replace([66, 77, 88, 99], np.nan, inplace=True) df.eduyrs.replace([77, 88, 99], np.nan, inplace=True) df.rlgblg.replace([7, 8, 9], np.nan, inplace=True) df.rlgdgr.replace([77, 88, 99], np.nan, inplace=True) df.hinctnta.replace([77, 88, 99], np.nan, inplace=True) df.yrbrn.replace([7777, 8888, 9999], np.nan, inplace=True) df.inwyr.replace([9999], np.nan, inplace=True) df['hasrelig'] = (df.rlgblg==1).astype(int) df.loc[df.rlgblg.isnull(), 'hasrelig'] = np.nan df['yrbrn60'] = df.yrbrn - 1960 df['inwyr07'] = df.inwyr - 2007 + np.random.uniform(-0.5, 0.5, len(df)) # In[9]: cycles = [df1, df2, df3, df4, df5] for cycle in cycles: clean_cycle(cycle) # In[10]: def resample(df): """Resample data by country. df: DataFrame returns: map from country code to DataFrame """ res = {} grouped = df.groupby('cntry') for name, group in grouped: sample = group.sample(len(group), weights=group.pspwght, replace=True) sample.index = range(len(group)) res[name] = sample return res # each cycle_map is a map from country code to DataFrame cycle_maps = [resample(cycle) for cycle in cycles] for cycle_map in cycle_maps: print(len(cycle_map), 'countries') # In a each cycle, a few questions were not asked in some countries. # In[11]: def check_variables(name, group): """Print variables missing from a group. 
name: group name (country code) group: DataFrame """ varnames = ['cntry', 'tvtot', 'tvpol', 'rdtot', 'rdpol', 'nwsptot', 'nwsppol', 'netuse', 'inwyr07', 'rlgblg', 'rlgdgr', 'eduyrs', 'hinctnta', 'yrbrn', 'pspwght', 'pweight'] for var in varnames: n = len(group[var].dropna()) if (n < 100): print(name, var, len(group[var].dropna())) for i, cycle_map in enumerate(cycle_maps): print('Cycle', i+1) for name, group in cycle_map.items(): check_variables(name, group) # I'll remove the cycle/country groups that are missing netuse or rlgblg. # # The ones that are missing income information undermine the ability to control for income, but it's a pretty weak effect, so I don't think it's a problem. # # inwtr07 is interview year shifted by 1970, the approximate mean. # In[12]: del cycle_maps[0]['FR'] del cycle_maps[0]['DE'] del cycle_maps[1]['FR'] del cycle_maps[1]['FI'] ee = cycle_maps[4]['EE'] ee.inwyr07 = 3 + np.random.uniform(-0.5, 0.5, len(ee)) # Income is reported on a different scale in different cycles, and differs substantially across countries. So I'm replacing it with rank on a per-country basis: that is, each respondent is mapped to their income rank, from 0 to 1, within their country. # # I do the same thing with years of education, in part because there are some suspiciously high values. At the very least, mapping to rank reduces the effect of outliers. # In[13]: def replace_var_with_rank(name, df, old, new): """Replaces a scale variable with a rank from 0-1. Creates a new column. 
name: country code df: DataFrame old: old variable name new: new variable name """ # jitter the data series = df[old] + np.random.uniform(-0.25, 0.25, len(df)) # if there's no data, just put in random values if len(series.dropna()) < 10: df[new] = np.random.random(len(df)) return # map from values to ranks cdf = thinkstats2.Cdf(series) df[new] = cdf.Probs(series) # make sure NaN maps to NaN df.loc[df[old].isnull(), new] = np.nan def replace_with_ranks(cycle_map): """Replace variables within countries. cycle_map: map from country code to DataFrame """ for name, group in cycle_map.items(): replace_var_with_rank(name, group, 'hinctnta', 'hincrank') replace_var_with_rank(name, group, 'eduyrs', 'edurank') for cycle_map in cycle_maps: replace_with_ranks(cycle_map) # To fill missing values, I am drawing random samples from the available values, on a per-country basis. # # Since I am not modeling relationships between variables, I am losing some information with this replacement policy; the net effect is to understate the actual strength of all relationships. # In[14]: def fill_var(df, old, new): """Fills missing values. Creates a new column df: DataFrame old: old variable name new: new variable name """ # find the NaN rows null = df[df[old].isnull()] # sample from the non-NaN rows fill = df[old].dropna().sample(len(null), replace=True) fill.index = null.index # replace NaNs with the random sample df[new] = df[old].fillna(fill) def fill_all_vars(df): """Fills missing values in the variables we need. df: DataFrame """ for old in ['hasrelig', 'rlgdgr', 'yrbrn60', 'edurank', 'hincrank', 'tvtot', 'rdtot', 'nwsptot', 'netuse', 'inwyr07']: new = old + '_f' fill_var(df, old, new) # In[15]: def fill_vars_by_country(cycle_map): for name, group in cycle_map.items(): fill_all_vars(group) for cycle_map in cycle_maps: fill_vars_by_country(cycle_map) # Concatenate the cycles. # In[16]: def concat_groups(cycle_map): """Concat all countries in a cycle. 
cycle_map: map from country code to DataFrame returns: DataFrame """ return pd.concat(cycle_map.values(), ignore_index=True) dfs = [concat_groups(cycle_map) for cycle_map in cycle_maps] for df in dfs: print(len(df)) # In[17]: df = pd.concat(dfs, ignore_index=True) print(df.shape) df.head() # TV watching time on average weekday # In[18]: df.tvtot.value_counts().sort_index() # Radio listening, total time on average weekday. # In[19]: df.rdtot.value_counts().sort_index() # Newspaper reading, total time on average weekday. # In[20]: df.nwsptot.value_counts().sort_index() # Personal use of Internet, email, www # In[21]: df.netuse.value_counts().sort_index() # Belong to a particular religion or denomination # In[22]: df.rlgblg.value_counts().sort_index() # How religious # In[23]: df.rlgdgr.value_counts().sort_index() # Total household net income, all sources # In[24]: df.hincrank.describe() # Year born # In[25]: df.yrbrn.describe() # In[26]: cdf = thinkstats2.Cdf(df.yrbrn) thinkplot.PrePlot(1) thinkplot.Cdf(cdf) thinkplot.Config(xlabel='Year born', ylabel='CDF', title='Disrtribution of year born', legend=False) # Shifted to mean near 0 # In[27]: df.yrbrn60.describe() # Number of years of education, converted to ranks. 
# In[28]:

df.edurank.describe()


# Country codes

# In[29]:

df.cntry.value_counts().sort_index()


# In[30]:

df.rlgdgr.value_counts().sort_index()


# In[31]:

df.inwyr07.describe()


# Run the model

# In[32]:

def run_model(df, formula):
    """Fits a logistic regression model.

    df: DataFrame
    formula: string patsy formula

    returns: fitted results object
    """
    model = smf.logit(formula, data=df)
    results = model.fit(disp=False)
    return results


# Here's the model with all control variables and all media variables:

# In[33]:

formula = ('hasrelig ~ yrbrn60 + edurank + hincrank +'
           'tvtot + rdtot + nwsptot + netuse')
res = run_model(df, formula)
res.summary()


# Now using the filled variables

# In[34]:

formula = ('hasrelig_f ~ yrbrn60_f + edurank_f + hincrank_f +'
           'tvtot_f + rdtot_f + nwsptot_f + netuse_f')
res = run_model(df, formula)
res.summary()


# Now adding inwyr07

# In[35]:

formula = ('hasrelig_f ~ inwyr07_f + yrbrn60_f + edurank_f + hincrank_f +'
           'tvtot_f + rdtot_f + nwsptot_f + netuse_f')
res = run_model(df, formula)
res.summary()


# In[36]:

def extract_res(res, var='netuse_f'):
    """Extracts the estimated coefficient for one variable.

    res: fitted results object
    var: string variable name

    returns: (number of observations, parameter estimate, significance stars)
             where stars is '**' for p < 0.01, '*' for p < 0.05, else ''
    """
    param = res.params[var]
    pvalue = res.pvalues[var]
    stars = '**' if pvalue < 0.01 else '*' if pvalue < 0.05 else ''
    return res.nobs, param, stars

extract_res(res)


# Group by country:

# In[37]:

grouped = df.groupby('cntry')


# Run a sample country

# In[38]:

group = grouped.get_group('IS')
run_model(group, formula).summary()


# Run all countries

# In[39]:

def run_logits(grouped, formula, var):
    """Fits a logit model per country and reports one coefficient.

    Prints one line per country: name, sample size, estimate, stars,
    and an arrow marking significant positive estimates.

    grouped: GroupBy object (by country)
    formula: string patsy formula
    var: string name of the variable to report
    """
    for name, group in grouped:
        country = '%14.14s' % COUNTRY[name]
        model = smf.logit(formula, data=group)
        results = model.fit(disp=False)
        nobs, param, stars = extract_res(results, var=var)
        arrow = '<--' if stars and param > 0 else ''
        print(country, nobs, '%0.3g'%param, stars, arrow, sep='\t')


# In[40]:

formula = ('hasrelig_f ~ inwyr07_f + yrbrn60_f + edurank_f + hincrank_f +'
           'tvtot_f + rdtot_f + nwsptot_f + netuse_f')
run_logits(grouped, formula, 'netuse_f')


# In[41]:

run_logits(grouped, formula, 'hincrank_f')


# In[42]:

run_logits(grouped, formula, 'edurank_f')


# Run OLS model with `rlgdgr`: "Regardless of whether you belong to a
# particular religion, how religious would you say you are?"

# In[43]:

formula = ('rlgdgr_f ~ inwyr07_f + yrbrn60_f + edurank_f + hincrank_f +'
           'tvtot_f + rdtot_f + nwsptot_f + netuse_f')
model = smf.ols(formula, data=df)
# NOTE: `disp` is a Logit optimizer option, not an OLS one; it was a
# leftover from run_model and is dropped here.
results = model.fit()
results.summary()


# In[44]:

def run_ols(grouped, formula, var):
    """Fits an OLS model per country and reports one coefficient.

    Prints one line per country: code, sample size, estimate, stars,
    and an arrow marking significant positive estimates.

    grouped: GroupBy object (by country)
    formula: string patsy formula
    var: string name of the variable to report
    """
    for name, group in grouped:
        model = smf.ols(formula, data=group)
        # `disp` is a Logit option; OLS.fit does not use it.
        results = model.fit()
        nobs, param, stars = extract_res(results, var=var)
        arrow = '<--' if stars and param > 0 else ''
        print(name, len(group), '%0.3g '%param, stars, arrow, sep='\t')


# In[45]:

run_ols(grouped, formula, 'netuse_f')


# In[46]:

run_ols(grouped, formula, 'edurank_f')


# In[47]:

run_ols(grouped, formula, 'hincrank_f')


# Let's see what happens if we add quadratic terms for edurank and hincrank:

# In[48]:

df['edurank_f2'] = df.edurank_f**2
df['hincrank_f2'] = df.hincrank_f**2


# In[49]:

formula = ('rlgdgr_f ~ inwyr07_f + yrbrn60_f + edurank_f + edurank_f2 + hincrank_f +'
           'tvtot_f + rdtot_f + nwsptot_f + netuse_f')


# In[50]:

run_ols(grouped, formula, 'edurank_f')


# In[51]:

run_ols(grouped, formula, 'edurank_f2')


# In[52]:

formula = ('rlgdgr_f ~ inwyr07_f + yrbrn60_f + edurank_f + edurank_f2 + '
           'hincrank_f + hincrank_f2 + '
           'tvtot_f + rdtot_f + nwsptot_f + netuse_f')


# In[53]:

run_ols(grouped, formula, 'hincrank_f')


# In[54]:

run_ols(grouped, formula, 'edurank_f')


# In[55]:

run_ols(grouped, formula, 'netuse_f')


# In[ ]: