#!/usr/bin/env python
# coding: utf-8

# ![img](ep8_2018-57x57.png)
# 
# # To bar, or not to bar - what is the intention?
# ## part II
# 
# **post @** [endlesspint.com](http://endlesspint.com/2018-11-07-to-bar-or-not-to-bar/)

# In[1]:


import numpy as np
import pandas as pd
from scipy import stats

import matplotlib.pyplot as plt
# The inline-plot magic only exists when running under IPython/Jupyter.
# When this exported notebook is executed as a plain script, `get_ipython`
# is undefined, so the magic is skipped instead of aborting with NameError.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass


# In[2]:


# Read the parameter-statistics workbook and show its column types, its
# (rows, columns) shape and the first rows.
df_param_stats = pd.read_excel("param_stats.xlsx")
print(df_param_stats.dtypes, df_param_stats.shape, sep="\n")
df_param_stats.head()


# In[3]:


import seaborn as sns

sns.__version__

# dot plot with ci lines for settings
    # or: https://seaborn.pydata.org/generated/seaborn.violinplot.html

# also, CLT: run above N times and grab mean


# In[4]:


# Distribution of mean_cuml for each d_threshold, split by m_nights.
ax = sns.violinplot(
    data=df_param_stats,
    x="d_threshold",
    y="mean_cuml",
    hue="m_nights",
)


# In[5]:


# Facet grid (rows: binary_eval, columns: change_rate) restricted to the
# rows where incl_contra is False.
ax = sns.catplot(
    data=df_param_stats[df_param_stats.incl_contra == False],
    x="d_threshold",
    y="mean_cuml",
    hue="m_nights",
    row="binary_eval",
    col="change_rate",
)
# plt.axhline(y=192, linewidth=5, color='red')


# In[6]:


# Same facet grid for the rows where incl_contra is True.
ax = sns.catplot(
    data=df_param_stats[df_param_stats.incl_contra == True],
    x="d_threshold",
    y="mean_cuml",
    hue="m_nights",
    row="binary_eval",
    col="change_rate",
)


# In[7]:


# use decision tree to identify parameters/thresholds to get high score (25+)
    # i.e., change_rate == 0.2, m_nights == 10, etc.


# In[8]:


import statsmodels.api as sm
# code template: http://blog.yhat.com/posts/logistic-regression-and-python.html


# In[9]:


# Summary statistics of the outcome column, followed by a peek at the rows.
print(df_param_stats["mean_cuml"].describe())
df_param_stats.head()


# In[10]:


# Column labels available for modelling.
df_param_stats.columns


# In[11]:


# Encode the two boolean settings and the outcome as 0/1 indicator columns.
# The dtype is pinned to uint8 (the default before pandas 2.0): newer pandas
# returns bool dummies, and a design matrix mixing float and bool columns is
# cast to object dtype, which statsmodels' Logit refuses to fit.
dummy_binary = pd.get_dummies(df_param_stats.binary_eval, prefix="binary", dtype=np.uint8)
dummy_binary.head()


# In[12]:


dummy_contrary = pd.get_dummies(df_param_stats.incl_contra, prefix="contrarians", dtype=np.uint8)
dummy_contrary.head()


# In[13]:


# Outcome indicator: 1 when mean_cuml exceeds 192.
# NOTE(review): 192 is presumably the top-quartile cut-off read from the
# describe() output of mean_cuml -- confirm against the data.
dummy_top_q = pd.get_dummies(df_param_stats.mean_cuml > 192., prefix="top_q", dtype=np.uint8)
dummy_top_q.head()


# In[14]:


# Numeric settings plus the "True" indicator of each dummy pair (the "False"
# column is redundant once an intercept is present).
cols_to_keep = ['d_threshold', 'm_nights', 'change_rate']
df_param_classify = (
    df_param_stats[cols_to_keep]
    .join(dummy_binary.binary_True)
    .join(dummy_contrary.contrarians_True)
    .join(dummy_top_q.top_q_True)
)
df_param_classify.head()


# In[15]:


# Explicit constant column; sm.Logit does not add an intercept by itself.
df_param_classify['intercept'] = 1.0
df_param_classify.columns


# In[16]:


# Outcome first, predictors after it.
df_param_classify = df_param_classify[[
    'top_q_True',
    'd_threshold',
    'm_nights',
    'change_rate',
    'binary_True',
    'contrarians_True',
    'intercept',
]]
df_param_classify.head()


# In[17]:


# Every column after the first one ('top_q_True', the outcome) is a regressor.
train_cols = df_param_classify.columns[1:]

logit = sm.Logit(
    endog=df_param_classify['top_q_True'],
    exog=df_param_classify[train_cols],
)

# estimate the coefficients
result = logit.fit()


# In[18]:


# Full regression table.
print(result.summary())


# In[19]:


# exponentiated coefficients, i.e. odds ratios only
print(np.exp(result.params))

# increases your shot of a good time: 
    # the more nights taken into consideration
    # updating preferences as binary decision: go / don't go
    # BIG ONE: having contrarians in the population
    
# decreases shot at good times:
    # making threshold too tight/close to zero
    # changing preferences too aggressively


# In[20]:


from sklearn import tree
# code template: http://scikit-learn.org/stable/modules/tree.html


# In[21]:


# Feature matrix and labels for the decision tree.
# Plain ndarrays are used instead of np.matrix: numpy discourages the matrix
# class, and scikit-learn >= 1.2 rejects np.matrix input with a TypeError.
# df_param_classify.columns[1:-1]
X = np.asarray(df_param_classify[['d_threshold', 'm_nights', 'change_rate', 'binary_True',
                                  'contrarians_True']])
y = np.array(df_param_classify.top_q_True)


# In[22]:


# First feature row, as a sanity check of the column order.
X[0]


# In[23]:


# Decision tree with scikit-learn's default settings, fitted on X / y.
clf = tree.DecisionTreeClassifier().fit(X, y)


# In[24]:


# Spot-check a single setting. Values follow the column order of X:
# d_threshold, m_nights, change_rate, binary_True, contrarians_True.
clf.predict([[-3.0, 10.0, 0.9, 1.0, 1.0]])


# In[25]:


# Class probabilities for the same setting.
clf.predict_proba([[-3.0, 10.0, 0.9, 1.0, 1.0]])


# In[26]:


# Probability of the second class only.
clf.predict_proba([[-3.0, 10.0, 0.9, 1.0, 1.0]])[0, 1]


# In[27]:


from itertools import product

# Candidate values for each setting, listed in the feature order the
# classifier was trained on (d_threshold, m_nights, change_rate,
# binary flag, contrarian flag).
ds = [-3, -2, -1]
ms = [10., 5., 3.]
cr = [.9, .5, .2]
bn = [1., 0.]
ct = [1., 0.]

# Every combination of the settings, in the same order nested loops over
# ds, ms, cr, bn, ct would visit them (the last list varies fastest).
setting_names = ['d_threshold', 'm_nights', 'change_rate', 'binary_eval', 'incl_contra']
settings_grid = list(product(ds, ms, cr, bn, ct))

# One batched predict_proba call instead of one call per combination;
# column 1 holds the probability of the second class.
fun_time_by_setting = clf.predict_proba(settings_grid)[:, 1]

# One record per combination: the setting values plus the predicted probability.
fun_time_probs = [
    dict(zip(setting_names, setting), fun_time=prob)
    for setting, prob in zip(settings_grid, fun_time_by_setting)
]

# `columns=` fixes the column order explicitly.
df_fun_time_probs = pd.DataFrame(fun_time_probs, columns=setting_names + ['fun_time'])
df_fun_time_probs.head()


# In[39]:


# df_fun_time_probs.to_excel("fun_time_probs.xlsx")


# In[73]:


# Sheet3 of the high-performer workbook: column types, shape and first rows.
df_param_contr = pd.read_excel("param_stats_high_perf.xlsx", sheet_name="Sheet3")
print(df_param_contr.dtypes, df_param_contr.shape, sep="\n")
df_param_contr.head()


# In[75]:


# mean_cuml per group, coloured by m_nights, faceted by binary_eval (rows)
# and change_rate (columns).
ax = sns.catplot(
    data=df_param_contr,
    x="group",
    y="mean_cuml",
    hue="m_nights",
    row="binary_eval",
    col="change_rate",
)


# In[76]:


# Sheet4 of the same workbook replaces the Sheet3 frame loaded earlier.
df_param_contr = pd.read_excel("param_stats_high_perf.xlsx", sheet_name="Sheet4")
print(df_param_contr.dtypes, df_param_contr.shape, sep="\n")
df_param_contr.head()


# In[79]:


# mean_cuml per group, coloured by m_nights.
ax = sns.catplot(
    data=df_param_contr,
    x="group",
    y="mean_cuml",
    hue="m_nights",
    row="binary_eval",
    col="change_rate",
)


# In[78]:


# Same facets, coloured by d_threshold instead.
ax = sns.catplot(
    data=df_param_contr,
    x="group",
    y="mean_cuml",
    hue="d_threshold",
    row="binary_eval",
    col="change_rate",
)


# In[50]:


# Number of rows with hist_cnt above 50, then how many of those also have
# mean_cuml_hist greater than mean_cuml_contr.
# Both conditions are combined into a single mask: chaining
# df[mask_a][mask_b] applies a full-length mask to an already filtered frame,
# which pandas has to re-align by index (and warns about).
many_hist = df_param_contr.hist_cnt > 50
hist_ahead = df_param_contr.mean_cuml_hist > df_param_contr.mean_cuml_contr
print(np.sum(many_hist))
df_param_contr[many_hist & hist_ahead].shape[0]


# In[49]:


# Mirror image: rows with hist_cnt below 50, and how many of those have
# mean_cuml_hist smaller than mean_cuml_contr.
# NOTE(review): rows with hist_cnt == 50 fall into neither group -- confirm
# that this is intended.
few_hist = df_param_contr.hist_cnt < 50
contr_ahead = df_param_contr.mean_cuml_hist < df_param_contr.mean_cuml_contr
print(np.sum(few_hist))
df_param_contr[few_hist & contr_ahead].shape[0]


# In[31]:


df_param_contr.columns


# In[64]:


# Reshape df_param_contr to long format: one row per (setting, source), with
# the value in a shared 'mean_cuml' column and a 0/1 'hist' flag marking
# which source column the value came from.
#
# rename() keeps a flat column Index; assigning a nested list to `.columns`
# would build a one-level MultiIndex instead. rename()/assign() also return
# new frames, so no column is written onto a slice of df_param_contr
# (which can trigger SettingWithCopyWarning).
setting_cols = ['d_threshold', 'm_nights', 'change_rate', 'binary_eval', 'incl_contra',
                'setting_run', 'hist_cnt']

# Half 1: values taken from mean_cuml_hist, flagged hist=1.
df_sub_hist = (
    df_param_contr[setting_cols + ['mean_cuml_hist']]
    .rename(columns={'mean_cuml_hist': 'mean_cuml'})
    .assign(hist=1)
)
print(df_sub_hist.shape)
df_sub_hist.head()


# In[65]:


# Half 2: values taken from mean_cuml_contr, flagged hist=0.
df_sub_contr = (
    df_param_contr[setting_cols + ['mean_cuml_contr']]
    .rename(columns={'mean_cuml_contr': 'mean_cuml'})
    .assign(hist=0)
)
print(df_sub_contr.shape)
df_sub_contr.head()


# In[66]:


# Stack both halves; ignore_index renumbers the rows 0..n-1.
frames = [df_sub_hist, df_sub_contr]
df_param_contr_longer = pd.concat(frames, ignore_index=True)

print(df_param_contr_longer.shape)
df_param_contr_longer.head()


# In[67]:


df_param_contr_longer.tail()


# In[68]:


df_param_contr_longer.dtypes


# In[71]:


df_param_contr_longer.shape


# In[59]:


# mean_cuml_contr per d_threshold, coloured by m_nights, faceted by
# binary_eval (rows) and change_rate (columns).
ax = sns.catplot(
    data=df_param_contr,
    x="d_threshold",
    y="mean_cuml_contr",
    hue="m_nights",
    row="binary_eval",
    col="change_rate",
)


# In[ ]: