import numpy as np
import random
from tabulate import tabulate
def sim_data(N, D, p_vl=0.01, p_l=0.1, p_h=0.9, p_skill=0.9):
#N: num patients
#D: num doctors
#p_vl, p_l, p_h: very low, low, and high probability (of recovery)
#p_skill: probability of patient being assigned a doctor with skill matching severity
#split doctors into two sets and make exactly half of them highly skilled
beta = np.zeros(D)
D_sk, D_nsk = D/2, D - D/2 #num skilled v.s. not skilled doctors, account for rounding error when D is odd
beta[random.sample(range(D),D_sk)] = 1.
#split patients into two sets and make exactly half of them highly severe
theta = np.zeros(N)
theta[random.sample(range(N),N/2)] = 1.
S = np.zeros(N) #skill level for each patient
Z = np.zeros((N,D)) #patient-doctor assignment matrix
Y = np.zeros(N) #outcome: did the patient recover?
for n in range(N):
#for each patient choose the skill level of their doctor
S[n] = np.random.binomial(1, theta[n]*p_skill + (1-theta[n])*(1-p_skill))
#choose doctor assignment to patient n
Z[n,:] = np.random.multinomial(1, beta*S[n]/float(D_sk) + (1-beta)*(1-S[n])/float(D_nsk))
d_n = Z[n,:].argmax()
p_recovery = S[n]*theta[n]*p_l + (1-S[n])*theta[n]*p_vl + (1-theta[n])*p_h
Y[n] = np.random.binomial(1, p_recovery)
return Y, Z, theta, beta
N, D = 1000, 10
Y, Z, theta, beta = sim_data(N, D)
print('Y: total, num recovered',N,Y.sum())
('Y: total, num recovered', 1000, 493.0)
tbl_naive = [(d, int(Z[:,d].sum()), (Y*Z[:,d]).sum()/Z[:,d].sum()) for d in range(D)]
print tabulate(tbl_naive,headers=['doctor id', 'num patients', 'survival rate'],floatfmt=".3f", tablefmt="html")
<table> <tr><th style="text-align: right;"> doctor id</th><th style="text-align: right;"> num patients</th><th style="text-align: right;"> survival rate</th></tr> <tr><td style="text-align: right;"> 0</td><td style="text-align: right;"> 117</td><td style="text-align: right;"> 0.205</td></tr> <tr><td style="text-align: right;"> 1</td><td style="text-align: right;"> 85</td><td style="text-align: right;"> 0.812</td></tr> <tr><td style="text-align: right;"> 2</td><td style="text-align: right;"> 93</td><td style="text-align: right;"> 0.129</td></tr> <tr><td style="text-align: right;"> 3</td><td style="text-align: right;"> 88</td><td style="text-align: right;"> 0.216</td></tr> <tr><td style="text-align: right;"> 4</td><td style="text-align: right;"> 98</td><td style="text-align: right;"> 0.153</td></tr> <tr><td style="text-align: right;"> 5</td><td style="text-align: right;"> 112</td><td style="text-align: right;"> 0.830</td></tr> <tr><td style="text-align: right;"> 6</td><td style="text-align: right;"> 89</td><td style="text-align: right;"> 0.809</td></tr> <tr><td style="text-align: right;"> 7</td><td style="text-align: right;"> 109</td><td style="text-align: right;"> 0.138</td></tr> <tr><td style="text-align: right;"> 8</td><td style="text-align: right;"> 102</td><td style="text-align: right;"> 0.824</td></tr> <tr><td style="text-align: right;"> 9</td><td style="text-align: right;"> 107</td><td style="text-align: right;"> 0.841</td></tr> </table>
def stratify_recovery(Y,Z,N,D,theta):
tau_strat = np.zeros((2,D))
tau_strat[0,:] = np.array([((1-theta)*Y*Z[:,d]).sum()/((1-theta)*Z[:,d]).sum() for d in range(D)])
tau_strat[1,:] = np.array([((theta)*Y*Z[:,d]).sum()/((theta)*Z[:,d]).sum() for d in range(D)])
return tau_strat
tau_strat = stratify_recovery(Y,Z,N,D,theta)
Nds = map(int,np.dot(theta, Z)) #num severely ill people treated by each doctor
print tabulate(zip(range(D),Nds,tau_strat[0,:],tau_strat[1,:]),
headers=['doctor id', 'num severe patients', 'recovery rate | non-severe', 'recovery rate | severe'],
floatfmt=".3f", tablefmt="html")
<table> <tr><th style="text-align: right;"> doctor id</th><th style="text-align: right;"> num severe patients</th><th style="text-align: right;"> recovery rate | non-severe</th><th style="text-align: right;"> recovery rate | severe</th></tr> <tr><td style="text-align: right;"> 0</td><td style="text-align: right;"> 103</td><td style="text-align: right;"> 0.857</td><td style="text-align: right;"> 0.117</td></tr> <tr><td style="text-align: right;"> 1</td><td style="text-align: right;"> 7</td><td style="text-align: right;"> 0.885</td><td style="text-align: right;"> 0.000</td></tr> <tr><td style="text-align: right;"> 2</td><td style="text-align: right;"> 87</td><td style="text-align: right;"> 0.833</td><td style="text-align: right;"> 0.080</td></tr> <tr><td style="text-align: right;"> 3</td><td style="text-align: right;"> 78</td><td style="text-align: right;"> 0.900</td><td style="text-align: right;"> 0.128</td></tr> <tr><td style="text-align: right;"> 4</td><td style="text-align: right;"> 92</td><td style="text-align: right;"> 1.000</td><td style="text-align: right;"> 0.098</td></tr> <tr><td style="text-align: right;"> 5</td><td style="text-align: right;"> 6</td><td style="text-align: right;"> 0.877</td><td style="text-align: right;"> 0.000</td></tr> <tr><td style="text-align: right;"> 6</td><td style="text-align: right;"> 11</td><td style="text-align: right;"> 0.923</td><td style="text-align: right;"> 0.000</td></tr> <tr><td style="text-align: right;"> 7</td><td style="text-align: right;"> 97</td><td style="text-align: right;"> 0.917</td><td style="text-align: right;"> 0.041</td></tr> <tr><td style="text-align: right;"> 8</td><td style="text-align: right;"> 11</td><td style="text-align: right;"> 0.923</td><td style="text-align: right;"> 0.000</td></tr> <tr><td style="text-align: right;"> 9</td><td style="text-align: right;"> 8</td><td style="text-align: right;"> 0.909</td><td style="text-align: right;"> 0.000</td></tr> </table>
def patient_match(Y, Z, N, D, theta):
#pick the patients of each doctor and match them randomly with another patient of the same severity (high or low)
#who had another doctor, then find average outcome difference
#question: they're all getting "treated" there are no controls? Then does this still make sense?
tau = np.zeros(N) #noisy treatment effect for each patient
for n in range(N):
#find patients of same severity who were treated by a different doctor:
dn = Z[n,:].argmax() #doctor of patient n
g = (theta*theta[n] + (1-theta)*(1-theta[n]))*(1-Z[:,dn])
#pick one at random uniformly
match = np.random.multinomial(1, g/g.sum()).argmax()
#confirm that they have the same severity and have different doctors (this should always be true)
assert theta[match] == theta[n], 'patients not matched by severity'
assert Z[match,:].argmax() != Z[n,:].argmax(), 'patients have same doctor (%i, %i)' % (match, n)
#calculate difference in outcomes:
tau[n] = Y[n] - Y[match]
#calculate average difference in outcomes over matches for each doctor:
tau_doctor = np.array([(tau*Z[:,d]).sum()/Z[:,d].sum() for d in range(D)])
return tau_doctor
tau_doctor = patient_match(Y, Z, N, D, theta)
print('ground truth skill of doctors',beta)
('ground truth skill of doctors', array([ 1., 0., 1., 1., 1., 0., 0., 1., 0., 0.]))
header_match = ['doctor id','avg treatment effect', 'doctor skill', 'num treated', 'avg recovery rate']
tbl_list_match = [(d,t,int(b),int(Z[:,d].sum()),(Y*Z[:,d]).sum()/Z[:,d].sum()) for t,b,d in zip(tau_doctor, beta, range(D))]
print tabulate(tbl_list_match, headers=header_match,floatfmt=".3f", tablefmt="html")
<table> <tr><th style="text-align: right;"> doctor id</th><th style="text-align: right;"> avg treatment effect</th><th style="text-align: right;"> doctor skill</th><th style="text-align: right;"> num treated</th><th style="text-align: right;"> avg recovery rate</th></tr> <tr><td style="text-align: right;"> 0</td><td style="text-align: right;"> 0.000</td><td style="text-align: right;"> 1</td><td style="text-align: right;"> 117</td><td style="text-align: right;"> 0.205</td></tr> <tr><td style="text-align: right;"> 1</td><td style="text-align: right;"> -0.024</td><td style="text-align: right;"> 0</td><td style="text-align: right;"> 85</td><td style="text-align: right;"> 0.812</td></tr> <tr><td style="text-align: right;"> 2</td><td style="text-align: right;"> 0.000</td><td style="text-align: right;"> 1</td><td style="text-align: right;"> 93</td><td style="text-align: right;"> 0.129</td></tr> <tr><td style="text-align: right;"> 3</td><td style="text-align: right;"> 0.023</td><td style="text-align: right;"> 1</td><td style="text-align: right;"> 88</td><td style="text-align: right;"> 0.216</td></tr> <tr><td style="text-align: right;"> 4</td><td style="text-align: right;"> -0.020</td><td style="text-align: right;"> 1</td><td style="text-align: right;"> 98</td><td style="text-align: right;"> 0.153</td></tr> <tr><td style="text-align: right;"> 5</td><td style="text-align: right;"> -0.080</td><td style="text-align: right;"> 0</td><td style="text-align: right;"> 112</td><td style="text-align: right;"> 0.830</td></tr> <tr><td style="text-align: right;"> 6</td><td style="text-align: right;"> 0.034</td><td style="text-align: right;"> 0</td><td style="text-align: right;"> 89</td><td style="text-align: right;"> 0.809</td></tr> <tr><td style="text-align: right;"> 7</td><td style="text-align: right;"> -0.064</td><td style="text-align: right;"> 1</td><td style="text-align: right;"> 109</td><td style="text-align: right;"> 0.138</td></tr> <tr><td style="text-align: right;"> 8</td><td style="text-align: right;"> 0.010</td><td style="text-align: right;"> 0</td><td style="text-align: right;"> 102</td><td style="text-align: right;"> 0.824</td></tr> <tr><td style="text-align: right;"> 9</td><td style="text-align: right;"> 0.037</td><td style="text-align: right;"> 0</td><td style="text-align: right;"> 107</td><td style="text-align: right;"> 0.841</td></tr> </table>
def propensity_score(Z, N, D, theta):
p = np.zeros((2,D))
p[0,:] = [(Z[:,d]*(1-theta)).sum()/(1-theta).sum() for d in range(D)]
p[1,:] = [(Z[:,d]*(theta)).sum()/(theta).sum() for d in range(D)]
return p
p = propensity_score(Z, N, D, theta)
print('propensity',p)
def avg_tau_propensity_weighted(Y, Z, p, N, D, theta):
tau = np.zeros(D)
itheta = map(int, theta)
#calculate the weighted effect on the treated (by doctor d)
eft = np.array([Y*Z[:,d]/p[itheta,d] for d in range(D)])
#calculate the weighted effect on the untreated (by doctor d)
efu = np.array([Y*(1-Z[:,d])/(1-p[itheta,d]) for d in range(D)])
return eft.mean(axis=1), efu.mean(axis=1), (eft - efu).mean(axis=1)
eft, efu, tau_pw = avg_tau_propensity_weighted(Y, Z, p, N, D, theta)
print tabulate([['non-severe']+list(p[0,:5]),['severe']+list(p[1,:5])],
headers=['unit covariate']+['doc_%i'%d for d in range(5)],
floatfmt=".3f", tablefmt="html")
print tabulate([['non-severe']+list(p[0,5:]),['severe']+list(p[1,5:])],
headers=['unit covariate']+['doc_%i'%d for d in range(5,D)],
floatfmt=".3f", tablefmt="html")
print tabulate(zip(range(D),map(int,beta),eft,efu,tau_pw),
headers=['doctor id','doctor skill','treated recovery','untreated recovery','avg treatment effect'],
floatfmt=".3f", tablefmt="html")
('propensity', array([[ 0.028, 0.156, 0.012, 0.02 , 0.012, 0.212, 0.156, 0.024, 0.182, 0.198], [ 0.206, 0.014, 0.174, 0.156, 0.184, 0.012, 0.022, 0.194, 0.022, 0.016]])) <table> <tr><th>unit covariate </th><th style="text-align: right;"> doc_0</th><th style="text-align: right;"> doc_1</th><th style="text-align: right;"> doc_2</th><th style="text-align: right;"> doc_3</th><th style="text-align: right;"> doc_4</th></tr> <tr><td>non-severe </td><td style="text-align: right;"> 0.028</td><td style="text-align: right;"> 0.156</td><td style="text-align: right;"> 0.012</td><td style="text-align: right;"> 0.020</td><td style="text-align: right;"> 0.012</td></tr> <tr><td>severe </td><td style="text-align: right;"> 0.206</td><td style="text-align: right;"> 0.014</td><td style="text-align: right;"> 0.174</td><td style="text-align: right;"> 0.156</td><td style="text-align: right;"> 0.184</td></tr> </table> <table> <tr><th>unit covariate </th><th style="text-align: right;"> doc_5</th><th style="text-align: right;"> doc_6</th><th style="text-align: right;"> doc_7</th><th style="text-align: right;"> doc_8</th><th style="text-align: right;"> doc_9</th></tr> <tr><td>non-severe </td><td style="text-align: right;"> 0.212</td><td style="text-align: right;"> 0.156</td><td style="text-align: right;"> 0.024</td><td style="text-align: right;"> 0.182</td><td style="text-align: right;"> 0.198</td></tr> <tr><td>severe </td><td style="text-align: right;"> 0.012</td><td style="text-align: right;"> 0.022</td><td style="text-align: right;"> 0.194</td><td style="text-align: right;"> 0.022</td><td style="text-align: right;"> 0.016</td></tr> </table> <table> <tr><th style="text-align: right;"> doctor id</th><th style="text-align: right;"> doctor skill</th><th style="text-align: right;"> treated recovery</th><th style="text-align: right;"> untreated recovery</th><th style="text-align: right;"> avg treatment effect</th></tr> <tr><td style="text-align: right;"> 0</td><td style="text-align: right;"> 1</td><td style="text-align: right;"> 0.487</td><td style="text-align: right;"> 0.489</td><td style="text-align: right;"> -0.003</td></tr> <tr><td style="text-align: right;"> 1</td><td style="text-align: right;"> 0</td><td style="text-align: right;"> 0.442</td><td style="text-align: right;"> 0.495</td><td style="text-align: right;"> -0.053</td></tr> <tr><td style="text-align: right;"> 2</td><td style="text-align: right;"> 1</td><td style="text-align: right;"> 0.457</td><td style="text-align: right;"> 0.494</td><td style="text-align: right;"> -0.037</td></tr> <tr><td style="text-align: right;"> 3</td><td style="text-align: right;"> 1</td><td style="text-align: right;"> 0.514</td><td style="text-align: right;"> 0.489</td><td style="text-align: right;"> 0.025</td></tr> <tr><td style="text-align: right;"> 4</td><td style="text-align: right;"> 1</td><td style="text-align: right;"> 0.549</td><td style="text-align: right;"> 0.491</td><td style="text-align: right;"> 0.058</td></tr> <tr><td style="text-align: right;"> 5</td><td style="text-align: right;"> 0</td><td style="text-align: right;"> 0.439</td><td style="text-align: right;"> 0.497</td><td style="text-align: right;"> -0.058</td></tr> <tr><td style="text-align: right;"> 6</td><td style="text-align: right;"> 0</td><td style="text-align: right;"> 0.462</td><td style="text-align: right;"> 0.492</td><td style="text-align: right;"> -0.030</td></tr> <tr><td style="text-align: right;"> 7</td><td style="text-align: right;"> 1</td><td style="text-align: right;"> 0.479</td><td style="text-align: right;"> 0.498</td><td style="text-align: right;"> -0.019</td></tr> <tr><td style="text-align: right;"> 8</td><td style="text-align: right;"> 0</td><td style="text-align: right;"> 0.462</td><td style="text-align: right;"> 0.492</td><td style="text-align: right;"> -0.030</td></tr> <tr><td style="text-align: right;"> 9</td><td style="text-align: right;"> 0</td><td style="text-align: right;"> 0.455</td><td style="text-align: right;"> 0.493</td><td style="text-align: right;"> -0.038</td></tr> </table>