#!/usr/bin/env python # coding: utf-8 # Internet use and religion in Europe, part two # ----------------------------------------- # # This notebook presents a second-pass analysis of the association between Internet use and religion in Europe, using data from the European Social Survey (http://www.europeansocialsurvey.org). # # Copyright 2015 Allen Downey # # MIT License: http://opensource.org/licenses/MIT # In[1]: from __future__ import print_function, division import numpy as np import pandas as pd import thinkstats2 import thinkplot import statsmodels.formula.api as smf from iso_country_codes import COUNTRY get_ipython().run_line_magic('matplotlib', 'inline') # The following function selects the columns I need. # In[2]: def read_cycle(filename): """Reads a file containing ESS data and selects columns. filename: string returns: DataFrame """ df = pd.read_stata(filename, convert_categoricals=False) if 'hinctnta' not in df.columns: df['hinctnta'] = df.hinctnt if 'inwyr' not in df.columns: df['inwyr'] = df.inwyye cols = ['cntry', 'inwyr', 'tvtot', 'tvpol', 'rdtot', 'rdpol', 'nwsptot', 'nwsppol', 'netuse', 'rlgblg', 'rlgdgr', 'eduyrs', 'hinctnta', 'yrbrn', 'eisced', 'pspwght', 'pweight'] df = df[cols] return df # Read data from Cycle 1. # In[3]: df1 = read_cycle('ESS1e06_4.dta') df1.head() # Read data from Cycle 2. # In[4]: df2 = read_cycle('ESS2e03_4.dta') df2.head() # Read data from Cycle 3. # In[5]: df3 = read_cycle('ESS3e03_5.dta') df3.head() # Read data from Cycle 4. # In[6]: df4 = read_cycle('ESS4e04_3.dta') df4.head() # Read data from Cycle 5. # In[7]: df5 = read_cycle('ESS5e03_2.dta') df5.head() # In[8]: def clean_cycle(df): """Cleans data from one cycle. 
df: DataFrame """ df.tvtot.replace([77, 88, 99], np.nan, inplace=True) df.rdtot.replace([77, 88, 99], np.nan, inplace=True) df.nwsptot.replace([77, 88, 99], np.nan, inplace=True) df.netuse.replace([77, 88, 99], np.nan, inplace=True) df.tvpol.replace([66, 77, 88, 99], np.nan, inplace=True) df.rdpol.replace([66, 77, 88, 99], np.nan, inplace=True) df.nwsppol.replace([66, 77, 88, 99], np.nan, inplace=True) df.eduyrs.replace([77, 88, 99], np.nan, inplace=True) df.rlgblg.replace([7, 8, 9], np.nan, inplace=True) df.rlgdgr.replace([77, 88, 99], np.nan, inplace=True) df.hinctnta.replace([77, 88, 99], np.nan, inplace=True) df.yrbrn.replace([7777, 8888, 9999], np.nan, inplace=True) df.inwyr.replace([9999], np.nan, inplace=True) df['hasrelig'] = (df.rlgblg==1).astype(int) df.loc[df.rlgblg.isnull(), 'hasrelig'] = np.nan df['yrbrn60'] = df.yrbrn - 1960 df['inwyr07'] = df.inwyr - 2007 + np.random.uniform(-0.5, 0.5, len(df)) # In[9]: cycles = [df1, df2, df3, df4, df5] for cycle in cycles: clean_cycle(cycle) # In[10]: def resample(df): """Resample data by country. df: DataFrame returns: map from country code to DataFrame """ res = {} grouped = df.groupby('cntry') for name, group in grouped: sample = group.sample(len(group), weights=group.pspwght, replace=True) sample.index = range(len(group)) res[name] = sample return res # each cycle_map is a map from country code to DataFrame cycle_maps = [resample(cycle) for cycle in cycles] for cycle_map in cycle_maps: print(len(cycle_map), 'countries') # In a each cycle, a few questions were not asked in some countries. # In[11]: def check_variables(name, group): """Print variables missing from a group. 
name: group name (country code) group: DataFrame """ varnames = ['cntry', 'tvtot', 'tvpol', 'rdtot', 'rdpol', 'nwsptot', 'nwsppol', 'netuse', 'inwyr07', 'rlgblg', 'rlgdgr', 'eduyrs', 'hinctnta', 'yrbrn', 'pspwght', 'pweight'] for var in varnames: n = len(group[var].dropna()) if (n < 100): print(name, var, len(group[var].dropna())) for i, cycle_map in enumerate(cycle_maps): print('Cycle', i+1) for name, group in cycle_map.items(): check_variables(name, group) # I'll remove the cycle/country groups that are missing netuse or rlgblg. # # The ones that are missing income information undermine the ability to control for income, but it's a pretty weak effect, so I don't think it's a problem. # # inwtr07 is interview year shifted by 1970, the approximate mean. # In[12]: del cycle_maps[0]['FR'] del cycle_maps[0]['DE'] del cycle_maps[1]['FR'] del cycle_maps[1]['FI'] ee = cycle_maps[4]['EE'] ee.inwyr07 = 3 + np.random.uniform(-0.5, 0.5, len(ee)) # Income is reported on a different scale in different cycles, and differs substantially across countries. So I'm replacing it with rank on a per-country basis: that is, each respondent is mapped to their income rank, from 0 to 1, within their country. # # I do the same thing with years of education, in part because there are some suspiciously high values. At the very least, mapping to rank reduces the effect of outliers. # In[13]: def replace_var_with_rank(name, df, old, new): """Replaces a scale variable with a rank from 0-1. Creates a new column. 
name: country code df: DataFrame old: old variable name new: new variable name """ # jitter the data series = df[old] + np.random.uniform(-0.25, 0.25, len(df)) # if there's no data, just put in random values if len(series.dropna()) < 10: df[new] = np.random.random(len(df)) return # map from values to ranks cdf = thinkstats2.Cdf(series) df[new] = cdf.Probs(series) # make sure NaN maps to NaN df.loc[df[old].isnull(), new] = np.nan def replace_with_ranks(cycle_map): """Replace variables within countries. cycle_map: map from country code to DataFrame """ for name, group in cycle_map.items(): replace_var_with_rank(name, group, 'hinctnta', 'hincrank') replace_var_with_rank(name, group, 'eduyrs', 'edurank') for cycle_map in cycle_maps: replace_with_ranks(cycle_map) # To fill missing values, I am drawing random samples from the available values, on a per-country basis. # # Since I am not modeling relationships between variables, I am losing some information with this replacement policy; the net effect is to understate the actual strength of all relationships. # In[14]: def fill_var(df, old, new): """Fills missing values. Creates a new column df: DataFrame old: old variable name new: new variable name """ # find the NaN rows null = df[df[old].isnull()] # sample from the non-NaN rows fill = df[old].dropna().sample(len(null), replace=True) fill.index = null.index # replace NaNs with the random sample df[new] = df[old].fillna(fill) def fill_all_vars(df): """Fills missing values in the variables we need. df: DataFrame """ for old in ['hasrelig', 'rlgdgr', 'yrbrn60', 'edurank', 'hincrank', 'tvtot', 'rdtot', 'nwsptot', 'netuse', 'inwyr07']: new = old + '_f' fill_var(df, old, new) # In[15]: def fill_vars_by_country(cycle_map): for name, group in cycle_map.items(): fill_all_vars(group) for cycle_map in cycle_maps: fill_vars_by_country(cycle_map) # Concatenate the cycles. # In[16]: def concat_groups(cycle_map): """Concat all countries in a cycle. 
cycle_map: map from country code to DataFrame returns: DataFrame """ return pd.concat(cycle_map.values(), ignore_index=True) dfs = [concat_groups(cycle_map) for cycle_map in cycle_maps] for df in dfs: print(len(df)) # In[17]: df = pd.concat(dfs, ignore_index=True) print(df.shape) df.head() # TV watching time on average weekday # In[18]: df.tvtot.value_counts().sort_index() # Radio listening, total time on average weekday. # In[19]: df.rdtot.value_counts().sort_index() # Newspaper reading, total time on average weekday. # In[20]: df.nwsptot.value_counts().sort_index() # Personal use of Internet, email, www # In[21]: df.netuse.value_counts().sort_index() # Belong to a particular religion or denomination # In[22]: df.rlgblg.value_counts().sort_index() # How religious # In[23]: df.rlgdgr.value_counts().sort_index() # Total household net income, all sources # In[24]: df.hincrank.describe() # Year born # In[25]: df.yrbrn.describe() # In[26]: cdf = thinkstats2.Cdf(df.yrbrn) thinkplot.PrePlot(1) thinkplot.Cdf(cdf) thinkplot.Config(xlabel='Year born', ylabel='CDF', title='Disrtribution of year born', legend=False) # Shifted to mean near 0 # In[27]: df.yrbrn60.describe() # Number of years of education, converted to ranks. 
# In[28]:

df.edurank.describe()


# Country codes

# In[29]:

df.cntry.value_counts().sort_index()


# In[30]:

df.rlgdgr.value_counts().sort_index()


# In[31]:

df.inwyr07.describe()


# Run the model

# In[32]:

def run_model(df, formula):
    """Fits a logistic regression model.

    df: DataFrame
    formula: string patsy formula

    returns: fitted results object
    """
    model = smf.logit(formula, data=df)
    results = model.fit(disp=False)
    return results


# Here's the model with all control variables and all media variables:

# In[33]:

formula = ('hasrelig ~ yrbrn60 + edurank + hincrank +'
           'tvtot + rdtot + nwsptot + netuse')
res = run_model(df, formula)
res.summary()


# Now using the filled variables

# In[34]:

formula = ('hasrelig_f ~ yrbrn60_f + edurank_f + hincrank_f +'
           'tvtot_f + rdtot_f + nwsptot_f + netuse_f')
res = run_model(df, formula)
res.summary()


# Now adding inwyr07

# In[35]:

formula = ('hasrelig_f ~ inwyr07_f + yrbrn60_f + edurank_f + hincrank_f +'
           'tvtot_f + rdtot_f + nwsptot_f + netuse_f')
res = run_model(df, formula)
res.summary()


# In[36]:

def extract_res(res, var='netuse_f'):
    """Extracts the estimated coefficient for one variable.

    res: fitted results object
    var: string variable name

    returns: (number of observations, parameter estimate, significance stars)
             where stars is '**' for p < 0.01, '*' for p < 0.05, else ''
    """
    param = res.params[var]
    pvalue = res.pvalues[var]
    stars = '**' if pvalue < 0.01 else '*' if pvalue < 0.05 else ''
    return res.nobs, param, stars

extract_res(res)


# Group by country:

# In[37]:

grouped = df.groupby('cntry')


# Run a sample country

# In[38]:

group = grouped.get_group('IS')
run_model(group, formula).summary()


# Run all countries

# In[39]:

def run_logits(grouped, formula, var):
    """Fits a logit model per country and reports one coefficient.

    Prints one line per country: name, sample size, estimate, stars,
    and an arrow marking significant positive estimates.

    grouped: GroupBy object (by country)
    formula: string patsy formula
    var: string name of the variable to report
    """
    for name, group in grouped:
        country = '%14.14s' % COUNTRY[name]
        model = smf.logit(formula, data=group)
        results = model.fit(disp=False)
        nobs, param, stars = extract_res(results, var=var)
        arrow = '<--' if stars and param > 0 else ''
        print(country, nobs, '%0.3g'%param, stars, arrow, sep='\t')


# In[40]:

formula = ('hasrelig_f ~ inwyr07_f + yrbrn60_f + edurank_f + hincrank_f +'
           'tvtot_f + rdtot_f + nwsptot_f + netuse_f')
run_logits(grouped, formula, 'netuse_f')


# In[41]:

run_logits(grouped, formula, 'hincrank_f')


# In[42]:

run_logits(grouped, formula, 'edurank_f')


# Run OLS model with `rlgdgr`: "Regardless of whether you belong to a
# particular religion, how religious would you say you are?"

# In[43]:

formula = ('rlgdgr_f ~ inwyr07_f + yrbrn60_f + edurank_f + hincrank_f +'
           'tvtot_f + rdtot_f + nwsptot_f + netuse_f')
model = smf.ols(formula, data=df)
# NOTE: `disp` is a Logit optimizer option, not an OLS one; it was a
# leftover from run_model and is dropped here.
results = model.fit()
results.summary()


# In[44]:

def run_ols(grouped, formula, var):
    """Fits an OLS model per country and reports one coefficient.

    Prints one line per country: code, sample size, estimate, stars,
    and an arrow marking significant positive estimates.

    grouped: GroupBy object (by country)
    formula: string patsy formula
    var: string name of the variable to report
    """
    for name, group in grouped:
        model = smf.ols(formula, data=group)
        # `disp` is a Logit option; OLS.fit does not use it.
        results = model.fit()
        nobs, param, stars = extract_res(results, var=var)
        arrow = '<--' if stars and param > 0 else ''
        print(name, len(group), '%0.3g '%param, stars, arrow, sep='\t')


# In[45]:

run_ols(grouped, formula, 'netuse_f')


# In[46]:

run_ols(grouped, formula, 'edurank_f')


# In[47]:

run_ols(grouped, formula, 'hincrank_f')


# Let's see what happens if we add quadratic terms for edurank and hincrank:

# In[48]:

df['edurank_f2'] = df.edurank_f**2
df['hincrank_f2'] = df.hincrank_f**2


# In[49]:

formula = ('rlgdgr_f ~ inwyr07_f + yrbrn60_f + edurank_f + edurank_f2 + hincrank_f +'
           'tvtot_f + rdtot_f + nwsptot_f + netuse_f')


# In[50]:

run_ols(grouped, formula, 'edurank_f')


# In[51]:

run_ols(grouped, formula, 'edurank_f2')


# In[52]:

formula = ('rlgdgr_f ~ inwyr07_f + yrbrn60_f + edurank_f + edurank_f2 + '
           'hincrank_f + hincrank_f2 + '
           'tvtot_f + rdtot_f + nwsptot_f + netuse_f')


# In[53]:

run_ols(grouped, formula, 'hincrank_f')


# In[54]:

run_ols(grouped, formula, 'edurank_f')


# In[55]:

run_ols(grouped, formula, 'netuse_f')


# In[ ]: