#!/usr/bin/env python
# coding: utf-8

# ![img](ep8_2018-57x57.png)
#
# # To bar, or not to bar - what is the intention?
# ## part II
#
# **post @** [endlesspint.com](http://endlesspint.com/2018-11-07-to-bar-or-not-to-bar/)

# In[1]:

import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

get_ipython().run_line_magic('matplotlib', 'inline')


# In[2]:

# Read the parameter-statistics workbook and print its column types and shape.
df_param_stats = pd.read_excel("param_stats.xlsx")
print(df_param_stats.dtypes)
print(df_param_stats.shape)
df_param_stats.head()


# In[3]:

import seaborn as sns

sns.__version__

# Plot ideas:
#   - dot plot with CI lines for settings
#   - or: https://seaborn.pydata.org/generated/seaborn.violinplot.html
#   - also, CLT: run above N times and grab mean


# In[4]:

# Distribution of mean_cuml per d_threshold, split by m_nights.
ax = sns.violinplot(
    data=df_param_stats,
    x="d_threshold",
    y="mean_cuml",
    hue="m_nights",
)


# In[5]:

# Both facet grids below share the same layout; only the rows fed in differ.
facet_spec = {
    "x": "d_threshold",
    "y": "mean_cuml",
    "hue": "m_nights",
    "row": "binary_eval",
    "col": "change_rate",
}

# Element-wise comparison against False (not `is`), so this stays a row mask.
without_contra = df_param_stats[df_param_stats.incl_contra == False]
ax = sns.catplot(data=without_contra, **facet_spec)
# plt.axhline(y=192, linewidth=5, color='red')


# In[6]:

with_contra = df_param_stats[df_param_stats.incl_contra == True]
ax = sns.catplot(data=with_contra, **facet_spec)


# In[7]:

# use decision tree to identify parameters/thresholds to get high score (25+)
# i.e., change_rate == 0.2, m_nights == 10, etc.
# In[8]:

import statsmodels.api as sm

# code template: http://blog.yhat.com/posts/logistic-regression-and-python.html


# In[9]:

print(df_param_stats.mean_cuml.describe())
df_param_stats.head()


# In[10]:

df_param_stats.columns


# In[11]:

# Indicator columns are requested as integers. pandas >= 2.0 returns booleans
# by default, and booleans joined with the float parameter columns produce an
# object-dtype design matrix that statsmodels refuses to fit.
dummy_binary = pd.get_dummies(df_param_stats.binary_eval, prefix="binary", dtype=int)
dummy_binary.head()


# In[12]:

dummy_contrary = pd.get_dummies(df_param_stats.incl_contra, prefix="contrarians", dtype=int)
dummy_contrary.head()


# In[13]:

# Target: 1 when mean_cuml exceeds 192, else 0.
# NOTE(review): 192 looks like a cut-off read off describe() above - confirm.
dummy_top_q = pd.get_dummies(df_param_stats.mean_cuml > 192., prefix="top_q", dtype=int)
dummy_top_q.head()


# In[14]:

cols_to_keep = ['d_threshold', 'm_nights', 'change_rate']
df_param_classify = (
    df_param_stats[cols_to_keep]
    .join(dummy_binary.binary_True)
    .join(dummy_contrary.contrarians_True)
    .join(dummy_top_q.top_q_True)
)
df_param_classify.head()


# In[15]:

# statsmodels does not add a constant term on its own.
df_param_classify['intercept'] = 1.0
df_param_classify.columns


# In[16]:

# Put the target first so that "everything after column 0" is the predictor set.
df_param_classify = df_param_classify[
    ['top_q_True', 'd_threshold', 'm_nights', 'change_rate',
     'binary_True', 'contrarians_True', 'intercept']
]
df_param_classify.head()


# In[17]:

train_cols = df_param_classify.columns[1:]

logit = sm.Logit(df_param_classify['top_q_True'], df_param_classify[train_cols])

# fit the model
result = logit.fit()


# In[18]:

print(result.summary())


# In[19]:

# odds ratios only
print(np.exp(result.params))

# increases your shot of a good time:
#     the more nights taken into consideration
#     updating preferences as binary decision: go / don't go
#     BIG ONE: having contrarians in the population

# decreases shot at good times:
#     making threshold too tight/close to zero
#     changing preferences too aggressively


# In[20]:

from sklearn import tree

# code template: http://scikit-learn.org/stable/modules/tree.html


# In[21]:

# df_param_classify.columns[1:-1]
# Plain ndarrays instead of np.matrix: np.matrix is deprecated and current
# scikit-learn versions reject it during input validation.
X = np.asarray(
    df_param_classify[['d_threshold', 'm_nights', 'change_rate', 'binary_True', 'contrarians_True']]
)
y = np.asarray(df_param_classify.top_q_True)


# In[22]:

X[0]


# In[23]:

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, y)


# In[24]:
# Spot-check the fitted tree on a single parameter setting, in feature order
# (d_threshold, m_nights, change_rate, binary_eval, incl_contra).
clf.predict([[-3., 10., 0.9, 1., 1.]])


# In[25]:

clf.predict_proba([[-3., 10., 0.9, 1., 1.]])


# In[26]:

clf.predict_proba([[-3., 10., 0.9, 1., 1.]])[0][1]


# In[27]:

from itertools import product

ds = [-3, -2, -1]
ms = [10., 5., 3.]
cr = [.9, .5, .2]
bn = [1., 0.]
ct = [1., 0.]

# Every combination of the settings above, in the same order the nested
# loops over ds, ms, cr, bn, ct would produce (last factor varies fastest).
grid_features = ['d_threshold', 'm_nights', 'change_rate', 'binary_eval', 'incl_contra']
df_fun_time_probs = pd.DataFrame(list(product(ds, ms, cr, bn, ct)), columns=grid_features)

# One batched call; column 1 is the probability of the second class in
# clf.classes_ (the positive "top_q" label when both classes were seen in fit).
df_fun_time_probs['fun_time'] = clf.predict_proba(
    np.asarray(df_fun_time_probs[grid_features])
)[:, 1]
df_fun_time_probs.head()


# In[39]:

# df_fun_time_probs.to_excel("fun_time_probs.xlsx")


# In[73]:

df_param_contr = pd.read_excel("param_stats_high_perf.xlsx", sheet_name="Sheet3")
print(df_param_contr.dtypes)
print(df_param_contr.shape)
df_param_contr.head()


# In[75]:

ax = sns.catplot(x="group", y="mean_cuml", hue="m_nights",
                 row="binary_eval", col="change_rate",
                 data=df_param_contr)


# In[76]:

# Rebinds df_param_contr: everything below works on Sheet4, not Sheet3.
df_param_contr = pd.read_excel("param_stats_high_perf.xlsx", sheet_name="Sheet4")
print(df_param_contr.dtypes)
print(df_param_contr.shape)
df_param_contr.head()


# In[79]:

ax = sns.catplot(x="group", y="mean_cuml", hue="m_nights",
                 row="binary_eval", col="change_rate",
                 data=df_param_contr)


# In[78]:

ax = sns.catplot(x="group", y="mean_cuml", hue="d_threshold",
                 row="binary_eval", col="change_rate",
                 data=df_param_contr)


# In[50]:

# Both conditions are combined into one mask over the full frame. Filtering
# twice in a row would apply a full-length mask to an already-filtered frame,
# which pandas only tolerates by reindexing the mask (with a warning).
hist_majority = df_param_contr.hist_cnt > 50
hist_beats_contr = df_param_contr.mean_cuml_hist > df_param_contr.mean_cuml_contr
print(np.sum(hist_majority))
df_param_contr[hist_majority & hist_beats_contr].shape[0]


# In[49]:

# Rows with hist_cnt exactly 50 fall into neither this group nor the one above.
hist_minority = df_param_contr.hist_cnt < 50
contr_beats_hist = df_param_contr.mean_cuml_hist < df_param_contr.mean_cuml_contr
print(np.sum(hist_minority))
df_param_contr[hist_minority & contr_beats_hist].shape[0]


# In[31]:

df_param_contr.columns


# In[64]:

# Columns shared by both long-format halves; each half adds its own
# mean_cuml_* column, renamed to a common 'mean_cuml'.
shared_cols = ['d_threshold', 'm_nights', 'change_rate', 'binary_eval',
               'incl_contra', 'setting_run', 'hist_cnt']

# rename() keeps a flat column index (assigning a nested list to .columns
# builds a MultiIndex), and assign() returns a new frame rather than writing
# into a slice of df_param_contr.
df_sub_hist = (
    df_param_contr[shared_cols + ['mean_cuml_hist']]
    .rename(columns={'mean_cuml_hist': 'mean_cuml'})
    .assign(hist=1)
)
print(df_sub_hist.shape)
df_sub_hist.head()


# In[65]:

df_sub_contr = (
    df_param_contr[shared_cols + ['mean_cuml_contr']]
    .rename(columns={'mean_cuml_contr': 'mean_cuml'})
    .assign(hist=0)
)
print(df_sub_contr.shape)
df_sub_contr.head()


# In[66]:

# Stack the two halves; the 'hist' flag (1 / 0) records which half a row came from.
frames = [df_sub_hist, df_sub_contr]
df_param_contr_longer = pd.concat(frames, ignore_index=True)
print(df_param_contr_longer.shape)
df_param_contr_longer.head()


# In[67]:

df_param_contr_longer.tail()


# In[68]:

df_param_contr_longer.dtypes


# In[71]:

df_param_contr_longer.shape


# In[59]:

ax = sns.catplot(x="d_threshold", y="mean_cuml_contr", hue="m_nights",
                 row="binary_eval", col="change_rate",
                 data=df_param_contr)


# In[ ]: