#!/usr/bin/env python
# coding: utf-8

# # Welcome to your Quantitative Social Sciences Analysis Toolkit!
# 
# 

# ## 1. load survey data into the notebook 
# (run this first, and run it again every time you close and reopen the notebook)

# In[9]:


import pandas as pd # load a specialized piece of software that will help us with the analysis
# Read the survey responses into a table (a pandas DataFrame) named `data`.
# low_memory=False asks pandas to parse the file in one pass rather than in
# chunks, which avoids mixed-type guesses within a column.
data = pd.read_csv(
    'data/anes_pilot_2019.csv',
    low_memory=False,
)


# ## 2. display data

# In[10]:


# Display a snapshot of the raw data, transposed: the first column shows your
# variables and the other columns are individual responses.
data.transpose()


# ## 3. get category counts for a categorical variable
# 

# In[12]:


from prettytable import PrettyTable
from ipywidgets import interact
from IPython.core.display import display, HTML
from IPython.html.widgets import SelectMultiple

@interact(variable=data.columns.sort_values())
def categorical_table(variable='V161002'):
    """Display an HTML table with the number of responses per category.

    Parameters
    ----------
    variable : str
        Name of the column in ``data`` to tabulate. The dropdown created by
        ``interact`` offers every column name in sorted order.

    The rows are grouped with ``data.groupby(variable).size()``, so categories
    appear in sorted order. Nothing is returned; the table is rendered with
    ``display``.
    """
    counts = data.groupby(variable).size()

    table = PrettyTable()
    table.field_names = [variable, 'Count']
    # Iterate the counts Series directly: going through
    # reset_index().iterrows() packs each (category, count) pair into a single
    # Series, which upcasts the integer count to float whenever the category
    # itself is a float (so counts would be shown as e.g. 35.0).
    for category, count in counts.items():
        table.add_row((category, count))

    display(HTML(table.get_html_string()))


# ## 4. get average and spread for a continuous variable
# 

# In[57]:


from prettytable import PrettyTable
from ipywidgets import interact, widgets
import numpy as np
from IPython.core.display import display, HTML
from matplotlib import pyplot as plt

def cast(v):
    """Convert ``v`` to a float, or return NaN when it cannot be converted.

    Parameters
    ----------
    v : object
        A raw cell value (number, numeric string, text, None, ...).

    Returns
    -------
    float
        ``float(v)`` when the conversion succeeds, otherwise ``np.nan``.
    """
    try:
        return float(v)
    # Only conversion failures mean "not a number"; a bare ``except`` would
    # also swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
    except (TypeError, ValueError, OverflowError):
        return np.nan


# Dropdown listing every column of the survey table, in sorted order.
variable_select = widgets.Dropdown(options=data.columns.sort_values())

# Multi-select whose options are the distinct values of the chosen column;
# it is filled in by update_drop_select below.
drop_select = widgets.SelectMultiple(options=[])

def update_drop_select(*args):
    """Refresh ``drop_select`` with the distinct values of the selected column.

    Accepts (and ignores) the change notification that the widget passes to
    its observers, so it can also be called directly with no arguments.
    """
    column = variable_select.value
    if column is None:
        # No column available/selected: nothing can be offered for dropping.
        drop_select.options = []
        return

    values = data[column].unique()
    try:
        options = np.sort(values)
    except TypeError:
        # Columns that mix text with numbers or missing values cannot be
        # ordered directly; fall back to ordering by their text form.
        options = sorted(values, key=str)
    drop_select.options = options

variable_select.observe(update_drop_select, 'value')

# Observers only fire when the selection changes, so fill the list once for
# the column that is selected initially.
update_drop_select()

def printer(variable, drop_vals, drop_na, zoom=widgets.IntSlider(min=10,max=100,step=5,value=10)):
    """Show the mean, standard deviation and histogram of one survey column.

    Parameters
    ----------
    variable : str
        Name of the column in ``data`` to summarise.
    drop_vals : sequence
        Raw column values whose rows are excluded before anything else.
    drop_na : bool
        When true, keep only strictly positive values; zeros, negative codes
        and entries that could not be converted to a number are removed.
    zoom : int
        Number of histogram bins (supplied by the slider).

    Nothing is returned: the summary table is rendered with ``display`` and
    the histogram is drawn on a new matplotlib figure, with a red line at the
    mean and black lines at 1, 2 and 3 standard deviations on either side.
    """
    # Work on the single column that is needed instead of copying the whole
    # survey table on every widget change.
    raw = data[variable]

    # Filter on the raw values, because those are what the drop list offers.
    # Comparing after the float conversion would never match text-coded
    # values such as '9' against 9.0.
    raw = raw[~raw.isin(list(drop_vals))]
    values = raw.apply(cast)

    if drop_na:
        values = values[values > 0]

    if len(drop_vals):
        print('dropped values: {}'.format(drop_vals))

    mu = np.mean(values)
    sigma = np.std(values)

    table = PrettyTable()
    table.field_names = [variable, 'mean', 'standard deviation']
    table.add_row((variable, mu, sigma))
    display(HTML(table.get_html_string()))

    plt.figure(figsize=(10,5))
    # NaN entries (possible when drop_na is off) would make the histogram
    # range undefined, so leave them out of the plot.
    plt.hist(values.dropna(), bins=zoom)
    ax = plt.gca()
    ymin, ymax = ax.get_ylim()

    # -3 .. +3 inclusive, so the markers are symmetric around the mean.
    for val in range(-3, 4):
        position = val*sigma + mu
        if val == 0:
            ax.vlines(position, ymin, ymax, alpha=1, color='red')
        else:
            # Lines get shorter and fainter the further they are from the mean.
            ax.vlines(position, ymin, ymax/np.abs(val)/(zoom/10),
                      alpha=1/np.abs(val), color='black')
            
# Hook the widgets up to `printer`; the trailing semicolon keeps the notebook
# from echoing the value returned by interact.
interact(
    printer,
    variable=variable_select,
    drop_vals=drop_select,
    drop_na=True,
);

from prettytable import PrettyTable
from ipywidgets import interact, widgets
import numpy as np
from IPython.core.display import display, HTML
from matplotlib import pyplot as plt

def cast(v):
    """Convert ``v`` to a float, or return NaN when it cannot be converted.

    Parameters
    ----------
    v : object
        A raw cell value (number, numeric string, text, None, ...).

    Returns
    -------
    float
        ``float(v)`` when the conversion succeeds, otherwise ``np.nan``.
    """
    try:
        return float(v)
    # Only conversion failures mean "not a number"; a bare ``except`` would
    # also swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
    except (TypeError, ValueError, OverflowError):
        return np.nan


@interact(variable=data.columns.sort_values())
def categorical_table(variable='V161267', zoom=widgets.IntSlider(min=10,max=100,step=5,value=10),drop_na=False):
    """Show the mean, standard deviation and histogram of one survey column.

    Parameters
    ----------
    variable : str
        Name of the column in ``data`` to summarise.
    zoom : int
        Number of histogram bins (supplied by the slider).
    drop_na : bool
        When true, keep only strictly positive values; zeros, negative codes
        and entries that could not be converted to a number are removed.

    Nothing is returned: the summary table is rendered with ``display`` and
    the histogram is drawn on a new matplotlib figure, with a red line at the
    mean and black lines at 1, 2 and 3 standard deviations on either side.
    """
    # Only one column is needed, so avoid copying the whole survey table.
    values = data[variable].apply(cast)
    if drop_na:
        values = values[values > 0]

    mu = np.mean(values)
    sigma = np.std(values)

    table = PrettyTable()
    table.field_names = [variable, 'mean', 'standard deviation']
    table.add_row((variable, mu, sigma))
    display(HTML(table.get_html_string()))

    plt.figure(figsize=(10,5))
    # NaN entries (possible when drop_na is off) would make the histogram
    # range undefined, so leave them out of the plot.
    plt.hist(values.dropna(), bins=zoom)
    ax = plt.gca()
    ymin, ymax = ax.get_ylim()

    # -3 .. +3 inclusive, so the markers are symmetric around the mean.
    for val in range(-3, 4):
        position = val*sigma + mu
        if val == 0:
            ax.vlines(position, ymin, ymax, alpha=1, color='red')
        else:
            # Lines get shorter and fainter the further they are from the mean.
            ax.vlines(position, ymin, ymax/np.abs(val)/(zoom/10),
                      alpha=1/np.abs(val), color='black')
            
# ## 5. Compare two categorical variables (or ordinal)
#  

# In[69]:


from ipywidgets import interact
import scipy.stats as scs
from scipy.stats import chi2_contingency


# One dropdown per variable, each listing every survey column in sorted order.
dependent_variable_select = widgets.Dropdown(options=data.columns.sort_values())
independent_variable_select = widgets.Dropdown(options=data.columns.sort_values())

# Multi-selects holding the distinct values of the chosen columns; they are
# filled in by the update functions below.
dependent_drop_select = widgets.SelectMultiple(options=[])
independent_drop_select = widgets.SelectMultiple(options=[])


def _sorted_column_values(column):
    """Return the distinct values of ``data[column]`` in sorted order."""
    if column is None:
        # No column available/selected: nothing can be offered for dropping.
        return []
    values = data[column].unique()
    try:
        return np.sort(values)
    except TypeError:
        # Columns that mix text with numbers or missing values cannot be
        # ordered directly; fall back to ordering by their text form.
        return sorted(values, key=str)


def update_dependent_drop_select(*args):
    """Refresh the dependent drop list; the change notification is ignored."""
    dependent_drop_select.options = _sorted_column_values(dependent_variable_select.value)


def update_independent_drop_select(*args):
    """Refresh the independent drop list; the change notification is ignored."""
    independent_drop_select.options = _sorted_column_values(independent_variable_select.value)


dependent_variable_select.observe(update_dependent_drop_select, 'value')
independent_variable_select.observe(update_independent_drop_select, 'value')

# Observers only fire when the selection changes, so fill both lists once for
# the columns that are selected initially.
update_dependent_drop_select()
update_independent_drop_select()

def categorical_table(dependent_variable, independent_variable, dep_drop_vals, indep_drop_vals,drop_na=True):
    """Cross-tabulate two survey columns and run a chi-squared test on them.

    Parameters
    ----------
    dependent_variable, independent_variable : str
        Names of the columns in ``data`` used for the rows and the columns of
        the cross-tabulation.
    dep_drop_vals, indep_drop_vals : sequence
        Raw values of the respective column whose rows are excluded.
    drop_na : bool
        When true, each column that can be converted to floats is converted,
        and rows whose value is not greater than -1 (including missing
        values) are removed. Columns that cannot be converted are left as is.

    Returns
    -------
    pandas.DataFrame
        The cross-tabulation. The chi-squared statistic and p-value are
        printed, unless no data is left after filtering.
    """
    # Only the two selected columns are needed; dict.fromkeys removes the
    # duplicate when the same column is chosen twice.
    columns = list(dict.fromkeys((dependent_variable, independent_variable)))
    df = data[columns].copy()

    # Apply the drops to the raw values, because those are what the drop
    # lists offer. Doing this after the float conversion below would never
    # match text-coded values such as '9' against 9.0.
    keep = ~df[dependent_variable].isin(list(dep_drop_vals))
    keep &= ~df[independent_variable].isin(list(indep_drop_vals))
    df = df[keep].copy()

    if drop_na:
        for variable in (independent_variable, dependent_variable):
            try:
                numeric = df[variable].astype(float)
            except (TypeError, ValueError):
                # Not a numeric column: keep its values untouched.
                continue
            df[variable] = numeric
            df = df[numeric > -1].copy()

    if len(dep_drop_vals):
        print('dropped dependent values: {}'.format(dep_drop_vals))

    if len(indep_drop_vals):
        print('dropped independent values: {}'.format(indep_drop_vals))

    cross_tab = pd.crosstab(df[dependent_variable], df[independent_variable])
    if cross_tab.size == 0:
        # chi2_contingency raises on an empty table; report it instead.
        print("no data left after filtering; chi-squared test skipped")
        return cross_tab

    result = chi2_contingency(cross_tab)
    print("chi-sq = {}, p-val = {}".format(round(result[0],5), round(result[1], 5)))
    return cross_tab

# Hook the four widgets up to `categorical_table`; the trailing semicolon
# keeps the notebook from echoing the value returned by interact.
interact(
    categorical_table,
    dependent_variable=dependent_variable_select,
    independent_variable=independent_variable_select,
    dep_drop_vals=dependent_drop_select,
    indep_drop_vals=independent_drop_select,
    drop_na=True,
);


# ## 6. Compare a categorical with a numeric/ordinal

# In[80]:


from ipywidgets import interact
import scipy.stats as scs
import statsmodels.api as sm
from statsmodels.formula.api import ols


# One dropdown per variable, each listing every survey column in sorted order.
categorical_variable_select = widgets.Dropdown(options=data.columns.sort_values())
numeric_variable_select = widgets.Dropdown(options=data.columns.sort_values())

# Multi-selects holding the distinct values of the chosen columns; they are
# filled in by the update functions below.
categorical_drop_select = widgets.SelectMultiple(options=[])
numeric_drop_select = widgets.SelectMultiple(options=[])


def _sorted_column_values(column):
    """Return the distinct values of ``data[column]`` in sorted order."""
    if column is None:
        # No column available/selected: nothing can be offered for dropping.
        return []
    values = data[column].unique()
    try:
        return np.sort(values)
    except TypeError:
        # Columns that mix text with numbers or missing values cannot be
        # ordered directly; fall back to ordering by their text form.
        return sorted(values, key=str)


def update_categorical_drop_select(*args):
    """Refresh the categorical drop list; the change notification is ignored."""
    categorical_drop_select.options = _sorted_column_values(categorical_variable_select.value)


def update_numeric_drop_select(*args):
    """Refresh the numeric drop list; the change notification is ignored."""
    numeric_drop_select.options = _sorted_column_values(numeric_variable_select.value)


categorical_variable_select.observe(update_categorical_drop_select, 'value')
numeric_variable_select.observe(update_numeric_drop_select, 'value')

# Observers only fire when the selection changes, so fill both lists once for
# the columns that are selected initially.
update_categorical_drop_select()
update_numeric_drop_select()


def categorical_table(categorical_variable, numeric_variable,cat_drop_vals, num_drop_vals, drop_na=True):
    """Compare a numeric survey column across the groups of a categorical one.

    Draws one overlaid histogram of ``numeric_variable`` per category, fits
    the model ``numeric_variable ~ C(categorical_variable)`` with ``ols`` and
    returns its pairwise t-tests between categories.

    Parameters
    ----------
    categorical_variable, numeric_variable : str
        Names of the columns in ``data`` to compare. They are inserted into
        the model formula as written.
    cat_drop_vals, num_drop_vals : sequence
        Raw values of the respective column whose rows are excluded.
    drop_na : bool
        When true, each column that can be converted to floats is converted,
        and rows whose value is not greater than -1 (including missing
        values) are removed. Columns that cannot be converted are left as is.

    Returns
    -------
    pandas.DataFrame or None
        The ``t``, ``P>|t|``, ``pvalue-hs`` and ``reject-hs`` columns of the
        pairwise comparison, or ``None`` (after printing a message) when the
        selected columns are not suitable.
    """
    # Only the two selected columns are needed; dict.fromkeys removes the
    # duplicate when the same column is chosen twice.
    columns = list(dict.fromkeys((categorical_variable, numeric_variable)))
    df = data[columns].copy()

    # Apply the drops to the raw values, because those are what the drop
    # lists offer. Doing this after the float conversion below would never
    # match text-coded values such as '9' against 9.0.
    keep = ~df[categorical_variable].isin(list(cat_drop_vals))
    keep &= ~df[numeric_variable].isin(list(num_drop_vals))
    df = df[keep].copy()

    if drop_na:
        for variable in (categorical_variable, numeric_variable):
            try:
                numeric = df[variable].astype(float)
            except (TypeError, ValueError):
                # Not a numeric column: keep its values untouched.
                continue
            df[variable] = numeric
            df = df[numeric > -1].copy()

    if len(df[categorical_variable].unique())>15:
        print("PLEASE CHOOSE A CATEGORICAL VARIABLE")
        return

    try:
        # Keep the converted column so the histogram and the model see
        # numbers even when drop_na is off and the raw column holds text.
        df[numeric_variable] = df[numeric_variable].astype(float)
    except (TypeError, ValueError):
        print("PLEASE CHOOSE A NUMERIC VARIABLE")
        return

    categories = np.sort(df[categorical_variable].unique())
    if len(categories) < 2:
        # A pairwise comparison needs at least two groups.
        print("PLEASE CHOOSE A CATEGORICAL VARIABLE WITH AT LEAST TWO CATEGORIES")
        return

    if len(cat_drop_vals):
        print('dropped categorical values: {}'.format(cat_drop_vals))

    if len(num_drop_vals):
        print('dropped numeric values: {}'.format(num_drop_vals))

    plt.figure(figsize=(10,5))
    ax = plt.gca()
    for c in categories:
        dat = df[df[categorical_variable]==c]
        # NaN entries would make the histogram range undefined.
        ax.hist(dat[numeric_variable].dropna(), alpha=.5, bins='doane')
    ax.legend(categories)

    res = ols("{} ~ C({})".format(numeric_variable, categorical_variable), df).fit()
    pw = res.t_test_pairwise("C({})".format(categorical_variable))
    return pw.result_frame[['t','P>|t|','pvalue-hs','reject-hs']]
    
    
# Hook the four widgets up to `categorical_table`.
interact(
    categorical_table,
    categorical_variable=categorical_variable_select,
    numeric_variable=numeric_variable_select,
    cat_drop_vals=categorical_drop_select,
    num_drop_vals=numeric_drop_select,
    drop_na=True,
)
# Ending the cell on a no-op statement keeps the notebook from echoing the
# value returned by interact.
pass


# In[ ]: