(Run this cell first, and run it again every time you close and reopen the notebook.)
import pandas as pd # load a specialized piece of software that will help us with the analysis
data = pd.read_csv('data/anes_pilot_2019.csv',low_memory=False) # read the survey table from the CSV file into a DataFrame
data.T # display a snapshot of the raw data, transposed -- the first column here shows your variables,
# the other columns are responses
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 3155 | 3156 | 3157 | 3158 | 3159 | 3160 | 3161 | 3162 | 3163 | 3164 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
version | ANES 2019 Pilot Study version 20200204 | ANES 2019 Pilot Study version 20200204 | ANES 2019 Pilot Study version 20200204 | ANES 2019 Pilot Study version 20200204 | ANES 2019 Pilot Study version 20200204 | ANES 2019 Pilot Study version 20200204 | ANES 2019 Pilot Study version 20200204 | ANES 2019 Pilot Study version 20200204 | ANES 2019 Pilot Study version 20200204 | ANES 2019 Pilot Study version 20200204 | ... | ANES 2019 Pilot Study version 20200204 | ANES 2019 Pilot Study version 20200204 | ANES 2019 Pilot Study version 20200204 | ANES 2019 Pilot Study version 20200204 | ANES 2019 Pilot Study version 20200204 | ANES 2019 Pilot Study version 20200204 | ANES 2019 Pilot Study version 20200204 | ANES 2019 Pilot Study version 20200204 | ANES 2019 Pilot Study version 20200204 | ANES 2019 Pilot Study version 20200204 |
caseid | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 3156 | 3157 | 3158 | 3159 | 3160 | 3161 | 3162 | 3163 | 3164 | 3165 |
weight | 1.34719693063187 | .780822076219216 | .966366930694957 | 1.10348514780374 | 1.09069730256741 | 1.02140871415171 | .964514474045239 | .83469258858232 | 1.53541542020853 | 1.32458088383641 | ... | 1.17827101584555 | .783602487218187 | .792508744423736 | 7.03646496881757 | .892833236147303 | 1.58161278448241 | .809576969671362 | |||
weight_spss | 1.10160293017768 | .638478211724453 | .790198239229266 | .902319805359118 | .891863184309371 | .835205905561853 | .788683485426792 | .682528129683763 | 1.25550918910451 | 1.08310978871303 | ... | .963472209656906 | .640751753798312 | .648034400315289 | 5.75371740500213 | .73006973719765 | 1.29328477387127 | .661991088100273 | |||
form | 1 | 1 | 1 | 2 | 2 | 2 | 2 | 1 | 1 | 1 | ... | 2 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
starttime | 12/31/2019 18:57:33 | 12/21/2019 4:19:56 | 12/22/2019 23:03:28 | 12/31/2019 19:53:14 | 12/21/2019 4:07:09 | 12/21/2019 22:45:18 | 12/27/2019 19:16:05 | 12/21/2019 23:21:55 | 12/25/2019 5:39:51 | 12/28/2019 3:09:16 | ... | 12/31/2019 19:41:53 | 12/31/2019 19:40:28 | 12/31/2019 19:40:59 | 12/31/2019 19:41:26 | 12/31/2019 19:42:13 | 12/31/2019 19:38:13 | 12/31/2019 20:14:34 | 12/31/2019 20:10:04 | 12/31/2019 22:10:05 | 12/31/2019 23:27:51 |
endtime | 12/31/2019 19:39:49 | 12/21/2019 4:53:19 | 12/22/2019 23:41:43 | 12/31/2019 20:23:11 | 12/21/2019 4:48:50 | 12/22/2019 0:28:27 | 12/27/2019 19:45:45 | 12/21/2019 23:40:20 | 12/25/2019 5:57:21 | 12/28/2019 3:35:48 | ... | 12/31/2019 20:08:20 | 12/31/2019 20:17:50 | 12/31/2019 20:13:32 | 12/31/2019 20:22:45 | 12/31/2019 20:28:23 | 12/31/2019 20:24:56 | 12/31/2019 20:53:50 | 12/31/2019 20:29:15 | 12/31/2019 22:52:37 | 1/1/2020 0:21:59 |
duration | 2536 | 2003 | 2295 | 1797 | 2501 | 6189 | 1780 | 1105 | 1050 | 1592 | ... | 1587 | 2242 | 1953 | 2479 | 2770 | 2803 | 2356 | 1151 | 2552 | 3248 |
pop_density_public | 1520 | 1800 | 70 | 7600 | 4430 | 11900 | 700 | 45000 | 5700 | 120 | ... | 400 | 3700 | 2000 | 1800 | 200 | 6600 | 1 | |||
flag_state | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
900 rows × 3165 columns
from prettytable import PrettyTable
from ipywidgets import interact
from IPython.core.display import display, HTML
from IPython.html.widgets import SelectMultiple
@interact(variable=data.columns.sort_values())
def categorical_table(variable='V161002'):
    """Display a frequency table for one column of ``data``.

    The table has one row per distinct value of ``variable`` and the number
    of rows in ``data`` that carry that value. It is rendered as HTML.

    Args:
        variable: Name of the column to tabulate.
    """
    table = PrettyTable()
    table.field_names = [variable, 'Count']
    # Iterate the counts Series directly. Going through
    # reset_index().iterrows() packs each value and its count into one
    # Series, which upcasts the integer count to float (e.g. 12 -> 12.0)
    # whenever the grouped values are floats.
    for value, count in data.groupby(variable).size().items():
        table.add_row((value, count))
    display(HTML(table.get_html_string()))
interactive(children=(Dropdown(description='variable', options=('CompletedSurveys', 'EnrollmentDate', 'FIPCoun…
from prettytable import PrettyTable
from ipywidgets import interact, widgets
import numpy as np
from IPython.core.display import display, HTML
from matplotlib import pyplot as plt
def cast(v):
    """Convert ``v`` to ``float``, returning ``np.nan`` when it cannot be converted.

    Args:
        v: Any value; typically a raw cell from the survey table.

    Returns:
        ``float(v)`` when the conversion succeeds, otherwise ``np.nan``.
    """
    try:
        return float(v)
    except (TypeError, ValueError, OverflowError):
        # Only genuine conversion failures map to NaN. A bare ``except``
        # would also swallow KeyboardInterrupt and SystemExit.
        return np.nan
# Dropdown listing every column of ``data`` in sorted order.
variable_select = widgets.Dropdown(options=data.columns.sort_values())
# Multi-select of values to exclude; starts empty and is filled by the
# observer registered below.
drop_select = widgets.SelectMultiple(options=[])


def update_drop_select(*args):
    """Refresh ``drop_select`` with the distinct values of the chosen column.

    Registered as an observer on ``variable_select``. The change notification
    received in ``args`` is ignored; the widget's current value is read
    instead.
    """
    distinct = data[variable_select.value].unique()
    try:
        ordered = np.sort(distinct)
    except TypeError:
        # Object columns that mix strings with float NaN cannot be ordered
        # by np.sort. Fall back to ordering by string form so the selector
        # is still populated instead of silently staying stale.
        ordered = sorted(distinct, key=str)
    drop_select.options = ordered


variable_select.observe(update_drop_select, 'value')
def printer(variable, drop_vals, drop_na, zoom=widgets.IntSlider(min=10,max=100,step=5,value=10)):
    """Summarise one variable with a mean/standard-deviation table and a histogram.

    Args:
        variable: Name of the column in ``data`` to analyse.
        drop_vals: Raw column values to exclude before anything is computed.
        drop_na: When true, keep only values greater than -1.
        zoom: Number of histogram bins; also scales the height of the
            standard-deviation markers.
    """
    # Only the selected column is needed, so avoid copying the whole table
    # on every widget update.
    raw = data[variable]
    # Match the drop list against the raw values, because those are what the
    # selector offers. After casting to float, string entries would never
    # compare equal to the selected values.
    keep = [v not in drop_vals for v in raw]
    values = raw[keep].apply(cast)
    if drop_na:
        # Same threshold as the other analysis cells in this notebook.
        # Using "> 0" would also discard legitimate zero responses.
        values = values[values > -1]
    # Entries that could not be converted are NaN. They never contribute to
    # the mean or standard deviation, and the histogram cannot bin them.
    values = values.dropna()

    if len(drop_vals):
        print('dropped values: {}'.format(drop_vals))
    if values.empty:
        print('no numeric values left to summarise')
        return

    mu = np.mean(values)
    sigma = np.std(values)
    table = PrettyTable()
    table.field_names = [variable, 'mean', 'standard deviation']
    table.add_row((variable, mu, sigma))
    display(HTML(table.get_html_string()))

    plt.figure(figsize=(10,5))
    plt.hist(values, bins=zoom)
    ax = plt.gca()
    ymin, ymax = ax.get_ylim()
    # Mark the mean in red and each whole standard deviation from -3 to +3
    # in black. range(-3, 4) is needed so the +3 marker is not left out.
    for k in range(-3, 4):
        position = mu + k * sigma
        if k == 0:
            ax.vlines(position, ymin, ymax, alpha=1, color='red')
        else:
            # Markers get shorter and fainter the further they are from the mean.
            ax.vlines(position, ymin, ymax / abs(k) / (zoom / 10), alpha=1 / abs(k), color='black')


interact(printer, drop_vals=drop_select, variable=variable_select, drop_na=True);
interactive(children=(Dropdown(description='variable', options=('CompletedSurveys', 'EnrollmentDate', 'FIPCoun…
from ipywidgets import interact
import scipy.stats as scs
from scipy.stats import chi2_contingency
# Column pickers for the two variables being cross-tabulated.
dependent_variable_select = widgets.Dropdown(options=data.columns.sort_values())
independent_variable_select = widgets.Dropdown(options=data.columns.sort_values())
# Values to exclude for each variable; filled by the observers below.
dependent_drop_select = widgets.SelectMultiple(options=[])
independent_drop_select = widgets.SelectMultiple(options=[])


def update_dependent_drop_select(*args):
    """Refresh ``dependent_drop_select`` with the chosen column's distinct values."""
    distinct = data[dependent_variable_select.value].unique()
    try:
        ordered = np.sort(distinct)
    except TypeError:
        # Object columns mixing strings with float NaN cannot be ordered by
        # np.sort; order by string form so the selector is still populated.
        ordered = sorted(distinct, key=str)
    dependent_drop_select.options = ordered


def update_independent_drop_select(*args):
    """Refresh ``independent_drop_select`` with the chosen column's distinct values."""
    distinct = data[independent_variable_select.value].unique()
    try:
        ordered = np.sort(distinct)
    except TypeError:
        # Object columns mixing strings with float NaN cannot be ordered by
        # np.sort; order by string form so the selector is still populated.
        ordered = sorted(distinct, key=str)
    independent_drop_select.options = ordered


dependent_variable_select.observe(update_dependent_drop_select, 'value')
independent_variable_select.observe(update_independent_drop_select, 'value')
def categorical_table(dependent_variable, independent_variable, dep_drop_vals, indep_drop_vals,drop_na=True):
    """Cross-tabulate two variables and print a chi-squared test of independence.

    Args:
        dependent_variable: Column of ``data`` used for the table rows.
        independent_variable: Column of ``data`` used for the table columns.
        dep_drop_vals: Values of the dependent variable to exclude.
        indep_drop_vals: Values of the independent variable to exclude.
        drop_na: When true, each variable that converts to float is
            converted and restricted to values greater than -1.

    Returns:
        The contingency table produced by ``pd.crosstab``.
    """
    # Copy only the columns involved instead of the whole table. The names
    # are de-duplicated so that choosing the same variable twice does not
    # produce a frame with repeated column labels.
    columns = list(dict.fromkeys([dependent_variable, independent_variable]))
    df = data[columns].copy()

    if drop_na:
        for variable in [independent_variable, dependent_variable]:
            try:
                as_float = df[variable].astype(float)
            except (TypeError, ValueError):
                # Non-numeric column: leave it untouched.
                continue
            df[variable] = as_float
            df = df[df[variable] > -1]

    df = df[[v not in dep_drop_vals for v in df[dependent_variable]]]
    df = df[[v not in indep_drop_vals for v in df[independent_variable]]]
    if len(dep_drop_vals):
        print('dropped dependent values: {}'.format(dep_drop_vals))
    if len(indep_drop_vals):
        print('dropped independent values: {}'.format(indep_drop_vals))

    cross_tab = pd.crosstab(df[dependent_variable], df[independent_variable])
    if cross_tab.size == 0:
        # chi2_contingency raises on an empty table, which happens when the
        # filters above remove every observation.
        print('no observations left to cross-tabulate')
        return cross_tab

    result = chi2_contingency(cross_tab)
    chi_sq, p_val = result[0], result[1]
    print("chi-sq = {}, p-val = {}".format(round(chi_sq,5), round(p_val, 5)))
    return cross_tab


interact(categorical_table, dependent_variable=dependent_variable_select ,
         independent_variable=independent_variable_select, dep_drop_vals=dependent_drop_select,
         indep_drop_vals=independent_drop_select, drop_na=True);
interactive(children=(Dropdown(description='dependent_variable', options=('CompletedSurveys', 'EnrollmentDate'…
from ipywidgets import interact
import scipy.stats as scs
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Column pickers for the grouping variable and the measured variable.
categorical_variable_select = widgets.Dropdown(options=data.columns.sort_values())
numeric_variable_select = widgets.Dropdown(options=data.columns.sort_values())
# Values to exclude for each variable; filled by the observers below.
categorical_drop_select = widgets.SelectMultiple(options=[])
numeric_drop_select = widgets.SelectMultiple(options=[])


def update_categorical_drop_select(*args):
    """Refresh ``categorical_drop_select`` with the chosen column's distinct values."""
    distinct = data[categorical_variable_select.value].unique()
    try:
        ordered = np.sort(distinct)
    except TypeError:
        # Object columns mixing strings with float NaN cannot be ordered by
        # np.sort; order by string form so the selector is still populated.
        ordered = sorted(distinct, key=str)
    categorical_drop_select.options = ordered


def update_numeric_drop_select(*args):
    """Refresh ``numeric_drop_select`` with the chosen column's distinct values."""
    distinct = data[numeric_variable_select.value].unique()
    try:
        ordered = np.sort(distinct)
    except TypeError:
        # Object columns mixing strings with float NaN cannot be ordered by
        # np.sort; order by string form so the selector is still populated.
        ordered = sorted(distinct, key=str)
    numeric_drop_select.options = ordered


categorical_variable_select.observe(update_categorical_drop_select, 'value')
numeric_variable_select.observe(update_numeric_drop_select, 'value')
def categorical_table(categorical_variable, numeric_variable,cat_drop_vals, num_drop_vals, drop_na=True):
    """Compare a numeric variable across the levels of a categorical variable.

    Draws one overlaid histogram of ``numeric_variable`` per category, fits
    ``numeric_variable ~ C(categorical_variable)`` by ordinary least squares
    and returns the pairwise t-tests between categories.

    Args:
        categorical_variable: Grouping column; rejected when it has more
            than 15 distinct values.
        numeric_variable: Measured column; rejected when it cannot be
            converted to float.
        cat_drop_vals: Values of the categorical variable to exclude.
        num_drop_vals: Values of the numeric variable to exclude.
        drop_na: When true, each variable that converts to float is
            converted and restricted to values greater than -1.

    Returns:
        The ``t``, ``P>|t|``, ``pvalue-hs`` and ``reject-hs`` columns of the
        pairwise comparison frame, or ``None`` when the selection is
        rejected.
    """
    # Copy only the columns involved instead of the whole table. The names
    # are de-duplicated in case the same variable is chosen twice.
    columns = list(dict.fromkeys([categorical_variable, numeric_variable]))
    df = data[columns].copy()

    if drop_na:
        for variable in [categorical_variable, numeric_variable]:
            try:
                as_float = df[variable].astype(float)
            except (TypeError, ValueError):
                # Non-numeric column: leave it untouched.
                continue
            df[variable] = as_float
            df = df[df[variable] > -1]

    if len(df[categorical_variable].unique())>15:
        print("PLEASE CHOOSE A CATEGORICAL VARIABLE")
        return
    try:
        numeric = df[numeric_variable].astype(float)
    except (TypeError, ValueError):
        print("PLEASE CHOOSE A NUMERIC VARIABLE")
        return
    # Keep the converted values so that the plot and the model work on the
    # same floats that were just validated.
    df = df.assign(**{numeric_variable: numeric})

    df = df[[v not in cat_drop_vals for v in df[categorical_variable]]]
    df = df[[v not in num_drop_vals for v in df[numeric_variable]]]
    # Label the dropped values by role in this cell; the earlier
    # "dependent/independent" wording was copied from the cross-tab cell.
    if len(cat_drop_vals):
        print('dropped categorical values: {}'.format(cat_drop_vals))
    if len(num_drop_vals):
        print('dropped numeric values: {}'.format(num_drop_vals))

    categories = np.sort(df[categorical_variable].unique())
    if len(categories) < 2:
        # Pairwise comparisons need at least two groups.
        print('need at least two categories to compare')
        return

    plt.figure(figsize=(10,5))
    ax = plt.gca()
    for category in categories:
        group = df.loc[df[categorical_variable] == category, numeric_variable]
        # Missing values cannot be binned, so drop them for the plot only.
        ax.hist(group.dropna(), alpha=.5, bins='doane')
    ax.legend(categories)

    res = ols("{} ~ C({})".format(numeric_variable, categorical_variable), df).fit()
    pw = res.t_test_pairwise("C({})".format(categorical_variable))
    return pw.result_frame[['t','P>|t|','pvalue-hs','reject-hs']]


interact(categorical_table, categorical_variable=categorical_variable_select,
         numeric_variable=numeric_variable_select, cat_drop_vals=categorical_drop_select,
         num_drop_vals=numeric_drop_select, drop_na=True)
pass
interactive(children=(Dropdown(description='categorical_variable', options=('CompletedSurveys', 'EnrollmentDat…