#!/usr/bin/env python
# coding: utf-8

# In[76]:


import numpy as np
import random
from tabulate import tabulate


# In[77]:


def sim_data(N, D, p_vl=0.01, p_l=0.1, p_h=0.9, p_skill=0.9):
    #N: num patients
    #D: num doctors
    #p_vl, p_l, p_h: very low, low, and high probability (of recovery)
    #p_skill: probability of patient being assigned a doctor with skill matching severity
    
    #split doctors into two sets and make exactly half of them highly skilled
    beta = np.zeros(D)
    D_sk, D_nsk = D/2, D - D/2 #num skilled v.s. not skilled doctors, account for rounding error when D is odd
    beta[random.sample(range(D),D_sk)] = 1.
    
    #split patients into two sets and make exactly half of them highly severe
    theta = np.zeros(N)
    theta[random.sample(range(N),N/2)] = 1.
    
    S = np.zeros(N) #skill level for each patient
    Z = np.zeros((N,D)) #patient-doctor assignment matrix
    Y = np.zeros(N) #outcome: did the patient recover?
    
    for n in range(N):
        #for each patient choose the skill level of their doctor
        S[n] = np.random.binomial(1, theta[n]*p_skill + (1-theta[n])*(1-p_skill))
        #choose doctor assignment to patient n
        Z[n,:] = np.random.multinomial(1, beta*S[n]/float(D_sk) + (1-beta)*(1-S[n])/float(D_nsk))
        d_n = Z[n,:].argmax()
        p_recovery = S[n]*theta[n]*p_l + (1-S[n])*theta[n]*p_vl + (1-theta[n])*p_h
        Y[n] = np.random.binomial(1, p_recovery)
    
    return Y, Z, theta, beta

N, D = 1000, 10
Y, Z, theta, beta = sim_data(N, D)
print('Y: total, num recovered',N,Y.sum())


# In[78]:


tbl_naive = [(d, int(Z[:,d].sum()), (Y*Z[:,d]).sum()/Z[:,d].sum()) for d in range(D)]
print tabulate(tbl_naive,headers=['doctor id', 'num patients', 'survival rate'],floatfmt=".3f", tablefmt="html")


# In[79]:


def stratify_recovery(Y,Z,N,D,theta):
    tau_strat = np.zeros((2,D))
    tau_strat[0,:] = np.array([((1-theta)*Y*Z[:,d]).sum()/((1-theta)*Z[:,d]).sum() for d in range(D)])
    tau_strat[1,:] = np.array([((theta)*Y*Z[:,d]).sum()/((theta)*Z[:,d]).sum() for d in range(D)])
    return tau_strat

tau_strat = stratify_recovery(Y,Z,N,D,theta)
Nds = map(int,np.dot(theta, Z)) #num severely ill people treated by each doctor
print tabulate(zip(range(D),Nds,tau_strat[0,:],tau_strat[1,:]),
               headers=['doctor id', 'num severe patients', 'recovery rate | non-severe', 'recovery rate | severe'],
               floatfmt=".3f", tablefmt="html")


# In[80]:


def patient_match(Y, Z, N, D, theta):
    #pick the patients of each doctor and match them randomly with another patient of the same severity (high or low)
    #who had another doctor, then find average outcome difference
    #question: they're all getting "treated" there are no controls? Then does this still make sense?

    tau = np.zeros(N) #noisy treatment effect for each patient
    
    for n in range(N):
        #find patients of same severity who were treated by a different doctor:
        dn = Z[n,:].argmax() #doctor of patient n
        g = (theta*theta[n] + (1-theta)*(1-theta[n]))*(1-Z[:,dn])
        #pick one at random uniformly
        match = np.random.multinomial(1, g/g.sum()).argmax()
        #confirm that they have the same severity and have different doctors (this should always be true)
        assert theta[match] == theta[n], 'patients not matched by severity'
        assert Z[match,:].argmax() != Z[n,:].argmax(), 'patients have same doctor (%i, %i)' % (match, n)
        #calculate difference in outcomes:
        tau[n] = Y[n] - Y[match]
        
    #calculate average difference in outcomes over matches for each doctor:
    tau_doctor = np.array([(tau*Z[:,d]).sum()/Z[:,d].sum() for d in range(D)])
    
    return tau_doctor

tau_doctor = patient_match(Y, Z, N, D, theta)


# In[81]:


print('ground truth skill of doctors',beta)


# In[82]:


header_match = ['doctor id','avg treatment effect', 'doctor skill', 'num treated', 'avg recovery rate']
tbl_list_match = [(d,t,int(b),int(Z[:,d].sum()),(Y*Z[:,d]).sum()/Z[:,d].sum()) for t,b,d in zip(tau_doctor, beta, range(D))]
print tabulate(tbl_list_match, headers=header_match,floatfmt=".3f", tablefmt="html")


# In[ ]:


# In[90]:


def propensity_score(Z, N, D, theta):
    p = np.zeros((2,D))
    p[0,:] = [(Z[:,d]*(1-theta)).sum()/(1-theta).sum() for d in range(D)]
    p[1,:] = [(Z[:,d]*(theta)).sum()/(theta).sum() for d in range(D)]
    return p

p = propensity_score(Z, N, D, theta)
print('propensity',p)

def avg_tau_propensity_weighted(Y, Z, p, N, D, theta):
    tau = np.zeros(D)
    itheta = map(int, theta)
    #calculate the weighted effect on the treated (by doctor d)
    eft = np.array([Y*Z[:,d]/p[itheta,d] for d in range(D)])
    #calculate the weighted effect on the untreated (by doctor d)
    efu = np.array([Y*(1-Z[:,d])/(1-p[itheta,d]) for d in range(D)])
    return eft.mean(axis=1), efu.mean(axis=1), (eft - efu).mean(axis=1)
                    
eft, efu, tau_pw = avg_tau_propensity_weighted(Y, Z, p, N, D, theta)
print tabulate([['non-severe']+list(p[0,:5]),['severe']+list(p[1,:5])],
              headers=['unit covariate']+['doc_%i'%d for d in range(5)],
              floatfmt=".3f", tablefmt="html")
print tabulate([['non-severe']+list(p[0,5:]),['severe']+list(p[1,5:])],
              headers=['unit covariate']+['doc_%i'%d for d in range(5,D)],
              floatfmt=".3f", tablefmt="html")
print tabulate(zip(range(D),map(int,beta),eft,efu,tau_pw),
               headers=['doctor id','doctor skill','treated recovery','untreated recovery','avg treatment effect'],
              floatfmt=".3f", tablefmt="html")


# In[ ]:


# In[ ]:


# In[ ]: