#!/usr/bin/env python
# coding: utf-8

# # Welcome to your Quantitative Social Sciences Analysis Toolkit!
#
#
# ## 1. load survey data into the notebook
# (run this first & run this every time you close and reopen the notebook)

# In[9]:


import pandas as pd  # load a specialized piece of software that will help us with the analysis

# low_memory=False parses each column in one pass, which avoids the
# mixed-type columns pandas can otherwise infer on wide CSV files.
data = pd.read_csv('data/anes_pilot_2019.csv', low_memory=False)  # read in the table of data


# ## 2. display data

# In[10]:


data.T  # display a snapshot of raw data -- the first column here shows your variables,
# the other columns are responses


# ## 3. get category counts for a categorical variable
#

# In[12]:


from prettytable import PrettyTable
# SelectMultiple lives in ipywidgets; the old IPython.html.widgets path
# predates the IPython 4 split and is deprecated.
from ipywidgets import interact, SelectMultiple
# IPython.display is the public home of display/HTML; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, HTML


@interact(variable=data.columns.sort_values())
def categorical_table(variable='V161002'):
    """Render an HTML table with the number of responses per category.

    Args:
        variable: Name of the column in ``data`` to tabulate. ``interact``
            replaces this with a dropdown listing every column, sorted.

    The table has one row per distinct value of the column and is shown via
    ``display``; nothing is returned. Missing values are not counted because
    ``groupby`` skips them.
    """
    counts = data.groupby(variable).size()

    table = PrettyTable()
    table.field_names = [variable, 'Count']
    # The grouped Series maps category -> count, so iterate it directly
    # instead of resetting the index and looking the count up as column 0.
    for category, count in counts.items():
        table.add_row((category, count))
    display(HTML(table.get_html_string()))


# ## 4. 
# get average and spread for a continuous variable
#

# In[57]:


import numpy as np
import pandas as pd
from IPython.display import display, HTML  # public location; IPython.core.display is deprecated
from ipywidgets import interact, widgets
from matplotlib import pyplot as plt
from prettytable import PrettyTable


def cast(v):
    """Return ``v`` converted to ``float``, or NaN when it cannot be converted."""
    try:
        return float(v)
    except (TypeError, ValueError, OverflowError):
        return np.nan


def _sorted_unique(series):
    """Return the distinct values of ``series`` in sorted order.

    Falls back to ordering by string representation when the values cannot
    be compared with each other (e.g. text mixed with NaN).
    """
    values = series.unique()
    try:
        return np.sort(values)
    except TypeError:
        return sorted(values, key=str)


def _without_values(df, variable, drop_vals):
    """Return the rows of ``df`` whose ``variable`` is not in ``drop_vals``."""
    return df[~df[variable].isin(list(drop_vals))]


def _drop_missing_codes(df, variables):
    """Keep only rows whose value is greater than -1 in each numeric column.

    Each column in ``variables`` that converts to float is replaced by its
    float version and filtered; columns that do not convert are left as-is.
    Presumably the negative codes mark missing/inapplicable answers in this
    survey -- confirm against the codebook.
    """
    for variable in variables:
        try:
            numeric = df[variable].astype(float)
        except (TypeError, ValueError):
            continue  # not a numeric column: nothing to filter on
        df = df.assign(**{variable: numeric})
        df = df[df[variable] > -1]
    return df


def _summarize_and_plot(values, variable, zoom):
    """Show mean / standard deviation of ``values`` and plot their histogram.

    Args:
        values: Float Series of responses (may contain NaN).
        variable: Column name, used as the table label.
        zoom: Number of histogram bins.

    Vertical lines mark the mean (red) and +/-1, 2 and 3 standard deviations
    (black, shorter and fainter the further they are from the mean).
    """
    # Entries that cast() could not convert are NaN; exclude them so the
    # statistics and the histogram describe the same set of responses.
    values = values.dropna()
    if values.empty:
        print('no numeric values to summarize for {}'.format(variable))
        return

    mu = values.mean()
    sigma = values.std(ddof=0)  # population std, same as np.std on the Series

    table = PrettyTable()
    table.field_names = [variable, 'mean', 'standard deviation']
    table.add_row((variable, mu, sigma))
    display(HTML(table.get_html_string()))

    plt.figure(figsize=(10, 5))
    plt.hist(values, bins=zoom)
    ax = plt.gca()
    ymin, ymax = ax.get_ylim()
    # range(-3, 4) so the markers are symmetric: -3 sigma .. +3 sigma.
    for k in range(-3, 4):
        position = k * sigma + mu
        if k == 0:
            ax.vlines(position, ymin, ymax, alpha=1, color='red')
        else:
            ax.vlines(position, ymin, ymax / abs(k) / (zoom / 10),
                      alpha=1 / abs(k), color='black')


variable_select = widgets.Dropdown(options=data.columns.sort_values())
drop_select = widgets.SelectMultiple(options=[])


def update_drop_select(*args):
    """Refresh the drop list with the values of the selected variable."""
    drop_select.options = _sorted_unique(data[variable_select.value])


variable_select.observe(update_drop_select, 'value')
# Observers only fire on change, so fill the list for the initial selection.
update_drop_select()


def printer(variable, drop_vals, drop_na, zoom=widgets.IntSlider(min=10, max=100, step=5, value=10)):
    """Summarize one numeric variable after optionally excluding values.

    Args:
        variable: Column of ``data`` to summarize.
        drop_vals: Raw column values to exclude before summarizing.
        drop_na: When true, also exclude values that are not greater than -1
            (and values that could not be converted to float).
        zoom: Histogram bin count; ``interact`` renders the default as a slider.
    """
    # Exclude on the raw values first: the drop list is built from raw
    # values, so they would no longer match once a text column is cast.
    column = data[variable]
    column = column[~column.isin(list(drop_vals))]
    values = column.apply(cast)
    if drop_na:
        # Same cutoff as the comparison sections below, so genuine zero
        # responses are kept.
        values = values[values > -1]
    if len(drop_vals):
        print('dropped values: {}'.format(drop_vals))
    _summarize_and_plot(values, variable, zoom)


interact(printer, drop_vals=drop_select, variable=variable_select, drop_na=True);


# NOTE: the name categorical_table is redefined in sections 5 and 6; each
# definition replaces the previous one.
@interact(variable=data.columns.sort_values())
def categorical_table(variable='V161267', zoom=widgets.IntSlider(min=10, max=100, step=5, value=10), drop_na=False):
    """Summarize one numeric variable (no drop list).

    Args:
        variable: Column of ``data`` to summarize.
        zoom: Histogram bin count; ``interact`` renders the default as a slider.
        drop_na: When true, exclude values that are not greater than -1.
    """
    values = data[variable].apply(cast)
    if drop_na:
        values = values[values > -1]
    _summarize_and_plot(values, variable, zoom)


# ## 5. Compare two categorical variables (or ordinal)
#

# In[69]:


from ipywidgets import interact
import scipy.stats as scs
from scipy.stats import chi2_contingency

dependent_variable_select = widgets.Dropdown(options=data.columns.sort_values())
independent_variable_select = widgets.Dropdown(options=data.columns.sort_values())
dependent_drop_select = widgets.SelectMultiple(options=[])
independent_drop_select = widgets.SelectMultiple(options=[])


def update_dependent_drop_select(*args):
    """Refresh the dependent drop list with the selected variable's values."""
    dependent_drop_select.options = _sorted_unique(data[dependent_variable_select.value])


def update_independent_drop_select(*args):
    """Refresh the independent drop list with the selected variable's values."""
    independent_drop_select.options = _sorted_unique(data[independent_variable_select.value])


dependent_variable_select.observe(update_dependent_drop_select, 'value')
independent_variable_select.observe(update_independent_drop_select, 'value')
# Fill both lists for the initial selections (observers fire only on change).
update_dependent_drop_select()
update_independent_drop_select()


def categorical_table(dependent_variable, independent_variable, dep_drop_vals, indep_drop_vals, drop_na=True):
    """Cross-tabulate two variables and print a chi-square test of independence.

    Args:
        dependent_variable: Column used for the rows of the table.
        independent_variable: Column used for the columns of the table.
        dep_drop_vals: Values of the dependent variable to exclude.
        indep_drop_vals: Values of the independent variable to exclude.
        drop_na: When true, exclude rows whose numeric code is not above -1.

    Returns:
        The contingency table (``pandas.DataFrame``).
    """
    # dict.fromkeys de-duplicates while keeping order, so choosing the same
    # variable twice does not produce duplicate column labels.
    columns = list(dict.fromkeys([dependent_variable, independent_variable]))
    df = data[columns].copy()
    if drop_na:
        df = _drop_missing_codes(df, (independent_variable, dependent_variable))
    df = _without_values(df, dependent_variable, dep_drop_vals)
    df = _without_values(df, independent_variable, indep_drop_vals)
    if len(dep_drop_vals):
        print('dropped dependent values: {}'.format(dep_drop_vals))
    if len(indep_drop_vals):
        print('dropped independent values: {}'.format(indep_drop_vals))

    cross_tab = pd.crosstab(df[dependent_variable], df[independent_variable])
    if cross_tab.size == 0:
        # chi2_contingency cannot run on an empty table.
        print('no responses left to compare after filtering')
        return cross_tab
    stats = chi2_contingency(cross_tab)
    print("chi-sq = {}, p-val = {}".format(round(stats[0], 5), round(stats[1], 5)))
    return cross_tab


interact(categorical_table,
         dependent_variable=dependent_variable_select,
         independent_variable=independent_variable_select,
         dep_drop_vals=dependent_drop_select,
         indep_drop_vals=independent_drop_select,
         drop_na=True);


# ## 6. Compare a categorical with a numeric/ordinal

# In[80]:


from ipywidgets import interact
import scipy.stats as scs
import statsmodels.api as sm
from statsmodels.formula.api import ols

categorical_variable_select = widgets.Dropdown(options=data.columns.sort_values())
numeric_variable_select = widgets.Dropdown(options=data.columns.sort_values())
categorical_drop_select = widgets.SelectMultiple(options=[])
numeric_drop_select = widgets.SelectMultiple(options=[])


def update_categorical_drop_select(*args):
    """Refresh the categorical drop list with the selected variable's values."""
    categorical_drop_select.options = _sorted_unique(data[categorical_variable_select.value])


def update_numeric_drop_select(*args):
    """Refresh the numeric drop list with the selected variable's values."""
    numeric_drop_select.options = _sorted_unique(data[numeric_variable_select.value])


categorical_variable_select.observe(update_categorical_drop_select, 'value')
numeric_variable_select.observe(update_numeric_drop_select, 'value')
# Fill both lists for the initial selections (observers fire only on change).
update_categorical_drop_select()
update_numeric_drop_select()


def categorical_table(categorical_variable, numeric_variable, cat_drop_vals, num_drop_vals, drop_na=True):
    """Compare a numeric variable across the groups of a categorical one.

    Plots one overlaid histogram per category, fits
    ``numeric ~ C(categorical)`` by ordinary least squares and returns the
    pairwise t-tests between categories.

    Args:
        categorical_variable: Grouping column; rejected when it has more
            than 15 distinct values.
        numeric_variable: Column that must be convertible to float.
        cat_drop_vals: Values of the categorical variable to exclude.
        num_drop_vals: Values of the numeric variable to exclude.
        drop_na: When true, exclude rows whose numeric code is not above -1.

    Returns:
        DataFrame with columns ``t``, ``P>|t|``, ``pvalue-hs`` and
        ``reject-hs``, or ``None`` when a variable is rejected.
    """
    columns = list(dict.fromkeys([categorical_variable, numeric_variable]))
    df = data[columns].copy()
    if drop_na:
        df = _drop_missing_codes(df, (categorical_variable, numeric_variable))

    if len(df[categorical_variable].unique()) > 15:
        print("PLEASE CHOOSE A CATEGORICAL VARIABLE")
        return
    try:
        df[numeric_variable].astype(float)
    except (TypeError, ValueError):
        print("PLEASE CHOOSE A NUMERIC VARIABLE")
        return

    df = _without_values(df, categorical_variable, cat_drop_vals)
    df = _without_values(df, numeric_variable, num_drop_vals)
    # Label the messages after the variables they actually refer to.
    if len(cat_drop_vals):
        print('dropped categorical values: {}'.format(cat_drop_vals))
    if len(num_drop_vals):
        print('dropped numeric values: {}'.format(num_drop_vals))

    categories = _sorted_unique(df[categorical_variable])
    plt.figure(figsize=(10, 5))
    ax = plt.gca()
    for category in categories:
        group = df[df[categorical_variable] == category]
        ax.hist(group[numeric_variable], alpha=.5, bins='doane')
    ax.legend(categories)

    res = ols("{} ~ C({})".format(numeric_variable, categorical_variable), df).fit()
    pw = res.t_test_pairwise("C({})".format(categorical_variable))
    return pw.result_frame[['t', 'P>|t|', 'pvalue-hs', 'reject-hs']]


interact(categorical_table,
         categorical_variable=categorical_variable_select,
         numeric_variable=numeric_variable_select,
         cat_drop_vals=categorical_drop_select,
         num_drop_vals=numeric_drop_select,
         drop_na=True)
pass  # keeps the notebook from echoing the interact return value


# In[ ]: