Notebook

1. load survey data into the notebook¶

(Run this cell first, and run it again every time you close and reopen the notebook.)

In [9]:

# pandas provides the DataFrame (table) type that every later cell works with
import pandas as pd

# Read the survey table from disk; every later cell refers to it as `data`.
data = pd.read_csv(
    'data/anes_pilot_2019.csv',
    low_memory=False,
)

2. display data¶

In [10]:

# Transposed snapshot of the raw data: the first column lists your variables,
# the remaining numbered columns are the individual responses.
data.transpose()

Out[10]:

	0	1	2	3	4	5	6	7	8	9	...	3155	3156	3157	3158	3159	3160	3161	3162	3163	3164
version	ANES 2019 Pilot Study version 20200204	ANES 2019 Pilot Study version 20200204	ANES 2019 Pilot Study version 20200204	ANES 2019 Pilot Study version 20200204	ANES 2019 Pilot Study version 20200204	ANES 2019 Pilot Study version 20200204	ANES 2019 Pilot Study version 20200204	ANES 2019 Pilot Study version 20200204	ANES 2019 Pilot Study version 20200204	ANES 2019 Pilot Study version 20200204	...	ANES 2019 Pilot Study version 20200204	ANES 2019 Pilot Study version 20200204	ANES 2019 Pilot Study version 20200204	ANES 2019 Pilot Study version 20200204	ANES 2019 Pilot Study version 20200204	ANES 2019 Pilot Study version 20200204	ANES 2019 Pilot Study version 20200204	ANES 2019 Pilot Study version 20200204	ANES 2019 Pilot Study version 20200204	ANES 2019 Pilot Study version 20200204
caseid	1	2	3	4	5	6	7	8	9	10	...	3156	3157	3158	3159	3160	3161	3162	3163	3164	3165
weight	1.34719693063187	.780822076219216	.966366930694957	1.10348514780374	1.09069730256741	1.02140871415171	.964514474045239	.83469258858232	1.53541542020853	1.32458088383641	...	1.17827101584555	.783602487218187	.792508744423736				7.03646496881757	.892833236147303	1.58161278448241	.809576969671362
weight_spss	1.10160293017768	.638478211724453	.790198239229266	.902319805359118	.891863184309371	.835205905561853	.788683485426792	.682528129683763	1.25550918910451	1.08310978871303	...	.963472209656906	.640751753798312	.648034400315289				5.75371740500213	.73006973719765	1.29328477387127	.661991088100273
form	1	1	1	2	2	2	2	1	1	1	...	2	1	2	2	1	2	2	2	2	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
starttime	12/31/2019 18:57:33	12/21/2019 4:19:56	12/22/2019 23:03:28	12/31/2019 19:53:14	12/21/2019 4:07:09	12/21/2019 22:45:18	12/27/2019 19:16:05	12/21/2019 23:21:55	12/25/2019 5:39:51	12/28/2019 3:09:16	...	12/31/2019 19:41:53	12/31/2019 19:40:28	12/31/2019 19:40:59	12/31/2019 19:41:26	12/31/2019 19:42:13	12/31/2019 19:38:13	12/31/2019 20:14:34	12/31/2019 20:10:04	12/31/2019 22:10:05	12/31/2019 23:27:51
endtime	12/31/2019 19:39:49	12/21/2019 4:53:19	12/22/2019 23:41:43	12/31/2019 20:23:11	12/21/2019 4:48:50	12/22/2019 0:28:27	12/27/2019 19:45:45	12/21/2019 23:40:20	12/25/2019 5:57:21	12/28/2019 3:35:48	...	12/31/2019 20:08:20	12/31/2019 20:17:50	12/31/2019 20:13:32	12/31/2019 20:22:45	12/31/2019 20:28:23	12/31/2019 20:24:56	12/31/2019 20:53:50	12/31/2019 20:29:15	12/31/2019 22:52:37	1/1/2020 0:21:59
duration	2536	2003	2295	1797	2501	6189	1780	1105	1050	1592	...	1587	2242	1953	2479	2770	2803	2356	1151	2552	3248
pop_density_public	1520	1800	70	7600	4430	11900	700	45000	5700	120	...	400	3700	2000				1800	200	6600	1
flag_state	0	0	0	0	0	0	0	0	0	0	...	0	0	0				0	0	0	0

900 rows × 3165 columns

3. get category counts for a categorical variable¶

In [12]:

from prettytable import PrettyTable
from ipywidgets import interact
# `IPython.html.widgets` is the pre-split location of the widgets; they now
# live in the ipywidgets package.
from ipywidgets import SelectMultiple
# `IPython.display` is the public home of display/HTML; importing `display`
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

@interact(variable=data.columns.sort_values())
def categorical_table(variable='V161002'):
    """Render an HTML table with the number of responses in each category.

    Args:
        variable: name of the column of ``data`` to tabulate. ``interact``
            offers the sorted column names of ``data`` as a dropdown.

    Missing values are not listed, because ``groupby`` leaves them out of the
    groups by default.
    """
    table = PrettyTable()
    table.field_names = [variable, 'Count']
    # Walk the counts Series directly. Going through reset_index().iterrows()
    # packs each category and its count into one row Series, which upcasts the
    # integer count to float (shown as e.g. "12.0") whenever the category
    # values are floats.
    for category, count in data.groupby(variable).size().items():
        table.add_row((category, count))
    display(HTML(table.get_html_string()))

interactive(children=(Dropdown(description='variable', options=('CompletedSurveys', 'EnrollmentDate', 'FIPCoun…

4. get average and spread for a continuous variable¶

In [57]:

from prettytable import PrettyTable
from ipywidgets import interact, widgets
import numpy as np
from IPython.core.display import display, HTML
from matplotlib import pyplot as plt

def cast(v):
    """Convert ``v`` to ``float``, or return NaN when it cannot be converted.

    Args:
        v: any value, e.g. a number, a numeric string, free text or ``None``.

    Returns:
        ``float(v)`` when the conversion succeeds, otherwise ``np.nan``.
    """
    try:
        return float(v)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures map to NaN. A bare `except:` would also
        # swallow KeyboardInterrupt / SystemExit raised during the call.
        return np.nan


variable_select = widgets.Dropdown(options=data.columns.sort_values())

drop_select = widgets.SelectMultiple(options=[])

def update_drop_select(*args):
    """Refill ``drop_select`` with the distinct values of the chosen variable.

    Registered as an observer on ``variable_select``; whatever arguments the
    observer machinery passes are accepted and ignored.
    """
    if variable_select.value is None:
        # Nothing selected (e.g. no columns), so there is nothing to list.
        drop_select.options = []
        return
    distinct = data[variable_select.value].unique()
    try:
        ordered = np.sort(distinct)
    except TypeError:
        # Columns that mix text with numbers or missing values (NaN) cannot be
        # compared directly; order them by their text form instead.
        ordered = sorted(distinct, key=str)
    drop_select.options = ordered

variable_select.observe(update_drop_select, 'value')
# The observer only fires when the selection changes, so populate the list
# once for the variable that is selected when the widgets are created.
update_drop_select()

def printer(variable, drop_vals, drop_na, zoom=widgets.IntSlider(min=10,max=100,step=5,value=10)):
    """Show the mean, standard deviation and a histogram of one variable.

    Args:
        variable: name of the column of ``data`` to summarise.
        drop_vals: raw values, as they appear in ``data``, to leave out.
        drop_na: when true, keep only values greater than -1. Values of -1 or
            below are presumably the survey's missing-data codes (confirm
            against the codebook); NaN never passes the comparison either.
            This matches the filter used by the comparison cells and keeps 0
            as a legitimate value.
        zoom: number of histogram bins. The default is a slider widget that
            ``interact`` turns into a control; the function body expects the
            resulting integer.
    """
    raw = data[variable]
    # Exclude the selected values while they still have their original type.
    # After the float cast below, a text selection such as '5' would never
    # compare equal to 5.0.
    raw = raw[~raw.isin(list(drop_vals))]
    values = raw.apply(cast)

    if drop_na:
        values = values[values > -1]

    if len(drop_vals):
        print('dropped values: {}'.format(drop_vals))

    table = PrettyTable()
    table.field_names = [variable, 'mean', 'standard deviation']
    mu = np.mean(values)
    sigma = np.std(values)
    table.add_row((variable, mu, sigma))

    display(HTML(table.get_html_string()))
    plt.figure(figsize=(10,5))
    # NaN (unparseable or missing entries) makes the histogram range
    # non-finite, so leave it out of the plot.
    plt.hist(values.dropna(), bins=zoom)
    ax = plt.gca()
    ymin, ymax = ax.get_ylim()

    # Mark the mean (red) and every whole standard deviation from -3 to +3;
    # range(-3, 4) is needed to include +3.
    for val in range(-3, 4):
        position = val*sigma + mu
        if val == 0:
            ax.vlines(position, ymin, ymax, alpha=1, color='red')
        else:
            # Lines further from the mean are drawn shorter and fainter.
            ax.vlines(position, ymin, ymax/np.abs(val)/(zoom/10), alpha=1/np.abs(val), color='black')

interact(printer, drop_vals=drop_select, variable=variable_select, drop_na=True);

interactive(children=(Dropdown(description='variable', options=('CompletedSurveys', 'EnrollmentDate', 'FIPCoun…

from prettytable import PrettyTable
from ipywidgets import interact, widgets
import numpy as np
# `IPython.display` is the public home of display/HTML; importing `display`
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
from matplotlib import pyplot as plt

def cast(v):
    """Convert ``v`` to ``float``, or return NaN when it cannot be converted."""
    try:
        return float(v)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures map to NaN; a bare `except:` would also
        # swallow KeyboardInterrupt / SystemExit.
        return np.nan

@interact(variable=data.columns.sort_values())
def categorical_table(variable='V161267', zoom=widgets.IntSlider(min=10,max=100,step=5,value=10),drop_na=False):
    """Show the mean, standard deviation and a histogram of one variable.

    Args:
        variable: name of the column of ``data`` to summarise.
        zoom: number of histogram bins (the slider's integer value).
        drop_na: when true, keep only values greater than -1; NaN never passes
            the comparison either. Zero is kept as a legitimate value.
    """
    values = data[variable].apply(cast)

    if drop_na:
        values = values[values > -1]

    table = PrettyTable()
    table.field_names = [variable, 'mean', 'standard deviation']
    mu = np.mean(values)
    sigma = np.std(values)
    table.add_row((variable, mu, sigma))

    display(HTML(table.get_html_string()))
    plt.figure(figsize=(10,5))
    # NaN makes the histogram range non-finite, so leave it out of the plot.
    plt.hist(values.dropna(), bins=zoom)
    ax = plt.gca()
    ymin, ymax = ax.get_ylim()

    # Mark the mean (red) and every whole standard deviation from -3 to +3;
    # range(-3, 4) is needed to include +3.
    for val in range(-3, 4):
        position = val*sigma + mu
        if val == 0:
            ax.vlines(position, ymin, ymax, alpha=1, color='red')
        else:
            ax.vlines(position, ymin, ymax/np.abs(val)/(zoom/10), alpha=1/np.abs(val), color='black')

5. Compare two categorical (or ordinal) variables¶

In [69]:

from ipywidgets import interact
import scipy.stats as scs
from scipy.stats import chi2_contingency


# This cell uses `np` and `widgets`; import them here so it does not rely on
# another analysis cell having been run first.
import numpy as np
from ipywidgets import widgets

dependent_variable_select = widgets.Dropdown(options=data.columns.sort_values())
independent_variable_select = widgets.Dropdown(options=data.columns.sort_values())

dependent_drop_select = widgets.SelectMultiple(options=[])
independent_drop_select = widgets.SelectMultiple(options=[])

def _crosstab_drop_options(column):
    """Return the distinct values of ``data[column]`` in sorted order."""
    if column is None:
        return []
    distinct = data[column].unique()
    try:
        return np.sort(distinct)
    except TypeError:
        # Columns that mix text with numbers or missing values (NaN) cannot be
        # compared directly; order them by their text form instead.
        return sorted(distinct, key=str)

def update_dependent_drop_select(*args):
    """Refill the dependent drop list; observer arguments are ignored."""
    dependent_drop_select.options = _crosstab_drop_options(dependent_variable_select.value)


def update_independent_drop_select(*args):
    """Refill the independent drop list; observer arguments are ignored."""
    independent_drop_select.options = _crosstab_drop_options(independent_variable_select.value)


dependent_variable_select.observe(update_dependent_drop_select, 'value')
independent_variable_select.observe(update_independent_drop_select, 'value')
# Observers only fire when a selection changes, so populate both lists once
# for the variables that are selected when the widgets are created.
update_dependent_drop_select()
update_independent_drop_select()

def categorical_table(dependent_variable, independent_variable, dep_drop_vals, indep_drop_vals,drop_na=True):
    """Cross-tabulate two variables and print a chi-squared test of independence.

    Args:
        dependent_variable: column of ``data`` used for the table rows.
        independent_variable: column of ``data`` used for the table columns.
        dep_drop_vals: raw values of the dependent variable to leave out.
        indep_drop_vals: raw values of the independent variable to leave out.
        drop_na: when true, each variable that converts to float is converted
            and only rows with a value greater than -1 are kept (NaN never
            passes the comparison). Variables that do not convert are left
            as they are.

    Returns:
        The contingency table produced by ``pd.crosstab``.
    """
    # Only the selected columns are needed. dict.fromkeys removes the
    # duplicate when the same variable is chosen on both sides.
    columns = list(dict.fromkeys([dependent_variable, independent_variable]))

    # Exclude the selected values while they still have their original type.
    # After the float conversion below, a text selection such as '5' would
    # never compare equal to 5.0.
    keep = ~data[dependent_variable].isin(list(dep_drop_vals))
    keep &= ~data[independent_variable].isin(list(indep_drop_vals))
    df = data.loc[keep, columns].copy()

    if drop_na:
        for variable in columns:
            try:
                as_float = df[variable].astype(float)
            except (TypeError, ValueError):
                # Non-numeric column: there are no numeric codes to filter.
                continue
            df[variable] = as_float
            df = df[df[variable] > -1].copy()

    if len(dep_drop_vals):
        print('dropped dependent values: {}'.format(dep_drop_vals))

    if len(indep_drop_vals):
        print('dropped independent values: {}'.format(indep_drop_vals))

    cross_tab = pd.crosstab(df[dependent_variable], df[independent_variable])
    if cross_tab.size == 0:
        # chi2_contingency raises on an empty table; report it instead.
        print('no rows left after filtering; chi-squared test skipped')
        return cross_tab

    stats = chi2_contingency(cross_tab)
    print("chi-sq = {}, p-val = {}".format(round(stats[0],5), round(stats[1], 5)))
    return cross_tab

interact(categorical_table, dependent_variable=dependent_variable_select ,
         independent_variable=independent_variable_select, dep_drop_vals=dependent_drop_select, 
         indep_drop_vals=independent_drop_select, drop_na=True);

interactive(children=(Dropdown(description='dependent_variable', options=('CompletedSurveys', 'EnrollmentDate'…

6. Compare a categorical variable with a numeric/ordinal variable¶

In [80]:

from ipywidgets import interact
import scipy.stats as scs
import statsmodels.api as sm
from statsmodels.formula.api import ols


# This cell uses `np` and `widgets`; import them here so it does not rely on
# another analysis cell having been run first.
import numpy as np
from ipywidgets import widgets

categorical_variable_select = widgets.Dropdown(options=data.columns.sort_values())
numeric_variable_select = widgets.Dropdown(options=data.columns.sort_values())

categorical_drop_select = widgets.SelectMultiple(options=[])
numeric_drop_select = widgets.SelectMultiple(options=[])

def _group_compare_drop_options(column):
    """Return the distinct values of ``data[column]`` in sorted order."""
    if column is None:
        return []
    distinct = data[column].unique()
    try:
        return np.sort(distinct)
    except TypeError:
        # Columns that mix text with numbers or missing values (NaN) cannot be
        # compared directly; order them by their text form instead.
        return sorted(distinct, key=str)

def update_categorical_drop_select(*args):
    """Refill the categorical drop list; observer arguments are ignored."""
    categorical_drop_select.options = _group_compare_drop_options(categorical_variable_select.value)

def update_numeric_drop_select(*args):
    """Refill the numeric drop list; observer arguments are ignored."""
    numeric_drop_select.options = _group_compare_drop_options(numeric_variable_select.value)


categorical_variable_select.observe(update_categorical_drop_select, 'value')
numeric_variable_select.observe(update_numeric_drop_select, 'value')
# Observers only fire when a selection changes, so populate both lists once
# for the variables that are selected when the widgets are created.
update_categorical_drop_select()
update_numeric_drop_select()


# The function below uses `np` and `plt`; import them here so this cell does
# not rely on another analysis cell having been run first.
import numpy as np
from matplotlib import pyplot as plt

def categorical_table(categorical_variable, numeric_variable,cat_drop_vals, num_drop_vals, drop_na=True):
    """Compare a numeric variable across the groups of a categorical variable.

    Draws one overlaid histogram per category, fits ``numeric ~ C(categorical)``
    with OLS and returns the pairwise t-tests between categories.

    Args:
        categorical_variable: grouping column of ``data``; rejected when it
            has more than 15 distinct values.
        numeric_variable: column of ``data`` that must convert to float.
        cat_drop_vals: raw values of the categorical variable to leave out.
        num_drop_vals: raw values of the numeric variable to leave out.
        drop_na: when true, each variable that converts to float is converted
            and only rows with a value greater than -1 are kept (NaN never
            passes the comparison).

    Returns:
        A frame with the 't', 'P>|t|', 'pvalue-hs' and 'reject-hs' columns of
        the pairwise comparison, or ``None`` when a variable is rejected.
    """
    # Only the selected columns are needed. dict.fromkeys removes the
    # duplicate when the same variable is chosen for both roles.
    columns = list(dict.fromkeys([categorical_variable, numeric_variable]))
    df = data[columns].copy()

    # Decide which rows the user excluded from the raw values. After the float
    # conversions below, a text selection such as '5' would never compare
    # equal to 5.0. The mask is applied after the variable checks, as before.
    excluded = (data[categorical_variable].isin(list(cat_drop_vals))
                | data[numeric_variable].isin(list(num_drop_vals)))

    if drop_na:
        for variable in columns:
            try:
                as_float = df[variable].astype(float)
            except (TypeError, ValueError):
                # Non-numeric column: there are no numeric codes to filter.
                continue
            df[variable] = as_float
            df = df[df[variable] > -1].copy()

    if len(df[categorical_variable].unique())>15:
        print("PLEASE CHOOSE A CATEGORICAL VARIABLE")
        return

    try:
        # Keep the converted column so the histograms and the model work on
        # numbers even when drop_na is off and the conversion above was skipped.
        df[numeric_variable] = df[numeric_variable].astype(float)
    except (TypeError, ValueError):
        print("PLEASE CHOOSE A NUMERIC VARIABLE")
        return

    df = df[~excluded.loc[df.index]]

    if len(cat_drop_vals):
        print('dropped dependent values: {}'.format(cat_drop_vals))

    if len(num_drop_vals):
        print('dropped independent values: {}'.format(num_drop_vals))

    plt.figure(figsize=(10,5))
    ax=plt.gca()
    categories = np.sort(df[categorical_variable].unique())
    for c in categories:
        group = df.loc[df[categorical_variable]==c, numeric_variable]
        # NaN makes the histogram range non-finite, so leave it out.
        ax.hist(group.dropna(), alpha=.5, bins='doane')
    ax.legend(categories)

    res = ols("{} ~ C({})".format(numeric_variable, categorical_variable), df).fit()
    pw = res.t_test_pairwise("C({})".format(categorical_variable))
    return pw.result_frame[['t','P>|t|','pvalue-hs','reject-hs']]



# The trailing semicolon keeps the notebook from echoing interact's return value.
interact(categorical_table, categorical_variable=categorical_variable_select,
         numeric_variable=numeric_variable_select, cat_drop_vals=categorical_drop_select, 
         num_drop_vals=numeric_drop_select, drop_na=True);

interactive(children=(Dropdown(description='categorical_variable', options=('CompletedSurveys', 'EnrollmentDat…

In [ ]:

Welcome to your Quantitative Social Sciences Analysis Toolkit!¶