In [76]:
import numpy as np
import random
from tabulate import tabulate

In [77]:
def sim_data(N, D, p_vl=0.01, p_l=0.1, p_h=0.9, p_skill=0.9):
#N: num patients
#D: num doctors
#p_vl, p_l, p_h: very low, low, and high probability (of recovery)
#p_skill: probability of patient being assigned a doctor with skill matching severity

#split doctors into two sets and make exactly half of them highly skilled
beta = np.zeros(D)
D_sk, D_nsk = D/2, D - D/2 #num skilled v.s. not skilled doctors, account for rounding error when D is odd
beta[random.sample(range(D),D_sk)] = 1.

#split patients into two sets and make exactly half of them highly severe
theta = np.zeros(N)
theta[random.sample(range(N),N/2)] = 1.

S = np.zeros(N) #skill level for each patient
Z = np.zeros((N,D)) #patient-doctor assignment matrix
Y = np.zeros(N) #outcome: did the patient recover?

for n in range(N):
#for each patient choose the skill level of their doctor
S[n] = np.random.binomial(1, theta[n]*p_skill + (1-theta[n])*(1-p_skill))
#choose doctor assignment to patient n
Z[n,:] = np.random.multinomial(1, beta*S[n]/float(D_sk) + (1-beta)*(1-S[n])/float(D_nsk))
d_n = Z[n,:].argmax()
p_recovery = S[n]*theta[n]*p_l + (1-S[n])*theta[n]*p_vl + (1-theta[n])*p_h
Y[n] = np.random.binomial(1, p_recovery)

return Y, Z, theta, beta

N, D = 1000, 10
Y, Z, theta, beta = sim_data(N, D)
print('Y: total, num recovered',N,Y.sum())

('Y: total, num recovered', 1000, 493.0)

In [78]:
tbl_naive = [(d, int(Z[:,d].sum()), (Y*Z[:,d]).sum()/Z[:,d].sum()) for d in range(D)]
print tabulate(tbl_naive,headers=['doctor id', 'num patients', 'survival rate'],floatfmt=".3f", tablefmt="html")

<table>
<tr><th style="text-align: right;">  doctor id</th><th style="text-align: right;">  num patients</th><th style="text-align: right;">  survival rate</th></tr>
<tr><td style="text-align: right;">          0</td><td style="text-align: right;">           117</td><td style="text-align: right;">          0.205</td></tr>
<tr><td style="text-align: right;">          1</td><td style="text-align: right;">            85</td><td style="text-align: right;">          0.812</td></tr>
<tr><td style="text-align: right;">          2</td><td style="text-align: right;">            93</td><td style="text-align: right;">          0.129</td></tr>
<tr><td style="text-align: right;">          3</td><td style="text-align: right;">            88</td><td style="text-align: right;">          0.216</td></tr>
<tr><td style="text-align: right;">          4</td><td style="text-align: right;">            98</td><td style="text-align: right;">          0.153</td></tr>
<tr><td style="text-align: right;">          5</td><td style="text-align: right;">           112</td><td style="text-align: right;">          0.830</td></tr>
<tr><td style="text-align: right;">          6</td><td style="text-align: right;">            89</td><td style="text-align: right;">          0.809</td></tr>
<tr><td style="text-align: right;">          7</td><td style="text-align: right;">           109</td><td style="text-align: right;">          0.138</td></tr>
<tr><td style="text-align: right;">          8</td><td style="text-align: right;">           102</td><td style="text-align: right;">          0.824</td></tr>
<tr><td style="text-align: right;">          9</td><td style="text-align: right;">           107</td><td style="text-align: right;">          0.841</td></tr>
</table>

In [79]:
def stratify_recovery(Y,Z,N,D,theta):
tau_strat = np.zeros((2,D))
tau_strat[0,:] = np.array([((1-theta)*Y*Z[:,d]).sum()/((1-theta)*Z[:,d]).sum() for d in range(D)])
tau_strat[1,:] = np.array([((theta)*Y*Z[:,d]).sum()/((theta)*Z[:,d]).sum() for d in range(D)])
return tau_strat

tau_strat = stratify_recovery(Y,Z,N,D,theta)
Nds = map(int,np.dot(theta, Z)) #num severely ill people treated by each doctor
print tabulate(zip(range(D),Nds,tau_strat[0,:],tau_strat[1,:]),
headers=['doctor id', 'num severe patients', 'recovery rate | non-severe', 'recovery rate | severe'],
floatfmt=".3f", tablefmt="html")

<table>
<tr><th style="text-align: right;">  doctor id</th><th style="text-align: right;">  num severe patients</th><th style="text-align: right;">  recovery rate | non-severe</th><th style="text-align: right;">  recovery rate | severe</th></tr>
<tr><td style="text-align: right;">          0</td><td style="text-align: right;">                  103</td><td style="text-align: right;">                       0.857</td><td style="text-align: right;">                   0.117</td></tr>
<tr><td style="text-align: right;">          1</td><td style="text-align: right;">                    7</td><td style="text-align: right;">                       0.885</td><td style="text-align: right;">                   0.000</td></tr>
<tr><td style="text-align: right;">          2</td><td style="text-align: right;">                   87</td><td style="text-align: right;">                       0.833</td><td style="text-align: right;">                   0.080</td></tr>
<tr><td style="text-align: right;">          3</td><td style="text-align: right;">                   78</td><td style="text-align: right;">                       0.900</td><td style="text-align: right;">                   0.128</td></tr>
<tr><td style="text-align: right;">          4</td><td style="text-align: right;">                   92</td><td style="text-align: right;">                       1.000</td><td style="text-align: right;">                   0.098</td></tr>
<tr><td style="text-align: right;">          5</td><td style="text-align: right;">                    6</td><td style="text-align: right;">                       0.877</td><td style="text-align: right;">                   0.000</td></tr>
<tr><td style="text-align: right;">          6</td><td style="text-align: right;">                   11</td><td style="text-align: right;">                       0.923</td><td style="text-align: right;">                   0.000</td></tr>
<tr><td style="text-align: right;">          7</td><td style="text-align: right;">                   97</td><td style="text-align: right;">                       0.917</td><td style="text-align: right;">                   0.041</td></tr>
<tr><td style="text-align: right;">          8</td><td style="text-align: right;">                   11</td><td style="text-align: right;">                       0.923</td><td style="text-align: right;">                   0.000</td></tr>
<tr><td style="text-align: right;">          9</td><td style="text-align: right;">                    8</td><td style="text-align: right;">                       0.909</td><td style="text-align: right;">                   0.000</td></tr>
</table>

In [80]:
def patient_match(Y, Z, N, D, theta):
#pick the patients of each doctor and match them randomly with another patient of the same severity (high or low)
#who had another doctor, then find average outcome difference
#question: they're all getting "treated" there are no controls? Then does this still make sense?

tau = np.zeros(N) #noisy treatment effect for each patient

for n in range(N):
#find patients of same severity who were treated by a different doctor:
dn = Z[n,:].argmax() #doctor of patient n
g = (theta*theta[n] + (1-theta)*(1-theta[n]))*(1-Z[:,dn])
#pick one at random uniformly
match = np.random.multinomial(1, g/g.sum()).argmax()
#confirm that they have the same severity and have different doctors (this should always be true)
assert theta[match] == theta[n], 'patients not matched by severity'
assert Z[match,:].argmax() != Z[n,:].argmax(), 'patients have same doctor (%i, %i)' % (match, n)
#calculate difference in outcomes:
tau[n] = Y[n] - Y[match]

#calculate average difference in outcomes over matches for each doctor:
tau_doctor = np.array([(tau*Z[:,d]).sum()/Z[:,d].sum() for d in range(D)])

return tau_doctor

tau_doctor = patient_match(Y, Z, N, D, theta)

In [81]:
print('ground truth skill of doctors',beta)

('ground truth skill of doctors', array([ 1.,  0.,  1.,  1.,  1.,  0.,  0.,  1.,  0.,  0.]))

In [82]:
header_match = ['doctor id','avg treatment effect', 'doctor skill', 'num treated', 'avg recovery rate']
tbl_list_match = [(d,t,int(b),int(Z[:,d].sum()),(Y*Z[:,d]).sum()/Z[:,d].sum()) for t,b,d in zip(tau_doctor, beta, range(D))]

<table>
<tr><th style="text-align: right;">  doctor id</th><th style="text-align: right;">  avg treatment effect</th><th style="text-align: right;">  doctor skill</th><th style="text-align: right;">  num treated</th><th style="text-align: right;">  avg recovery rate</th></tr>
<tr><td style="text-align: right;">          0</td><td style="text-align: right;">                 0.000</td><td style="text-align: right;">             1</td><td style="text-align: right;">          117</td><td style="text-align: right;">              0.205</td></tr>
<tr><td style="text-align: right;">          1</td><td style="text-align: right;">                -0.024</td><td style="text-align: right;">             0</td><td style="text-align: right;">           85</td><td style="text-align: right;">              0.812</td></tr>
<tr><td style="text-align: right;">          2</td><td style="text-align: right;">                 0.000</td><td style="text-align: right;">             1</td><td style="text-align: right;">           93</td><td style="text-align: right;">              0.129</td></tr>
<tr><td style="text-align: right;">          3</td><td style="text-align: right;">                 0.023</td><td style="text-align: right;">             1</td><td style="text-align: right;">           88</td><td style="text-align: right;">              0.216</td></tr>
<tr><td style="text-align: right;">          4</td><td style="text-align: right;">                -0.020</td><td style="text-align: right;">             1</td><td style="text-align: right;">           98</td><td style="text-align: right;">              0.153</td></tr>
<tr><td style="text-align: right;">          5</td><td style="text-align: right;">                -0.080</td><td style="text-align: right;">             0</td><td style="text-align: right;">          112</td><td style="text-align: right;">              0.830</td></tr>
<tr><td style="text-align: right;">          6</td><td style="text-align: right;">                 0.034</td><td style="text-align: right;">             0</td><td style="text-align: right;">           89</td><td style="text-align: right;">              0.809</td></tr>
<tr><td style="text-align: right;">          7</td><td style="text-align: right;">                -0.064</td><td style="text-align: right;">             1</td><td style="text-align: right;">          109</td><td style="text-align: right;">              0.138</td></tr>
<tr><td style="text-align: right;">          8</td><td style="text-align: right;">                 0.010</td><td style="text-align: right;">             0</td><td style="text-align: right;">          102</td><td style="text-align: right;">              0.824</td></tr>
<tr><td style="text-align: right;">          9</td><td style="text-align: right;">                 0.037</td><td style="text-align: right;">             0</td><td style="text-align: right;">          107</td><td style="text-align: right;">              0.841</td></tr>
</table>

In [90]:
def propensity_score(Z, N, D, theta):
p = np.zeros((2,D))
p[0,:] = [(Z[:,d]*(1-theta)).sum()/(1-theta).sum() for d in range(D)]
p[1,:] = [(Z[:,d]*(theta)).sum()/(theta).sum() for d in range(D)]
return p

p = propensity_score(Z, N, D, theta)
print('propensity',p)

def avg_tau_propensity_weighted(Y, Z, p, N, D, theta):
tau = np.zeros(D)
itheta = map(int, theta)
#calculate the weighted effect on the treated (by doctor d)
eft = np.array([Y*Z[:,d]/p[itheta,d] for d in range(D)])
#calculate the weighted effect on the untreated (by doctor d)
efu = np.array([Y*(1-Z[:,d])/(1-p[itheta,d]) for d in range(D)])
return eft.mean(axis=1), efu.mean(axis=1), (eft - efu).mean(axis=1)

eft, efu, tau_pw = avg_tau_propensity_weighted(Y, Z, p, N, D, theta)
print tabulate([['non-severe']+list(p[0,:5]),['severe']+list(p[1,:5])],
headers=['unit covariate']+['doc_%i'%d for d in range(5)],
floatfmt=".3f", tablefmt="html")
print tabulate([['non-severe']+list(p[0,5:]),['severe']+list(p[1,5:])],
headers=['unit covariate']+['doc_%i'%d for d in range(5,D)],
floatfmt=".3f", tablefmt="html")
print tabulate(zip(range(D),map(int,beta),eft,efu,tau_pw),
headers=['doctor id','doctor skill','treated recovery','untreated recovery','avg treatment effect'],
floatfmt=".3f", tablefmt="html")

('propensity', array([[ 0.028,  0.156,  0.012,  0.02 ,  0.012,  0.212,  0.156,  0.024,
0.182,  0.198],
[ 0.206,  0.014,  0.174,  0.156,  0.184,  0.012,  0.022,  0.194,
0.022,  0.016]]))
<table>
<tr><th>unit covariate  </th><th style="text-align: right;">  doc_0</th><th style="text-align: right;">  doc_1</th><th style="text-align: right;">  doc_2</th><th style="text-align: right;">  doc_3</th><th style="text-align: right;">  doc_4</th></tr>
<tr><td>non-severe      </td><td style="text-align: right;">  0.028</td><td style="text-align: right;">  0.156</td><td style="text-align: right;">  0.012</td><td style="text-align: right;">  0.020</td><td style="text-align: right;">  0.012</td></tr>
<tr><td>severe          </td><td style="text-align: right;">  0.206</td><td style="text-align: right;">  0.014</td><td style="text-align: right;">  0.174</td><td style="text-align: right;">  0.156</td><td style="text-align: right;">  0.184</td></tr>
</table>
<table>
<tr><th>unit covariate  </th><th style="text-align: right;">  doc_5</th><th style="text-align: right;">  doc_6</th><th style="text-align: right;">  doc_7</th><th style="text-align: right;">  doc_8</th><th style="text-align: right;">  doc_9</th></tr>
<tr><td>non-severe      </td><td style="text-align: right;">  0.212</td><td style="text-align: right;">  0.156</td><td style="text-align: right;">  0.024</td><td style="text-align: right;">  0.182</td><td style="text-align: right;">  0.198</td></tr>
<tr><td>severe          </td><td style="text-align: right;">  0.012</td><td style="text-align: right;">  0.022</td><td style="text-align: right;">  0.194</td><td style="text-align: right;">  0.022</td><td style="text-align: right;">  0.016</td></tr>
</table>
<table>
<tr><th style="text-align: right;">  doctor id</th><th style="text-align: right;">  doctor skill</th><th style="text-align: right;">  treated recovery</th><th style="text-align: right;">  untreated recovery</th><th style="text-align: right;">  avg treatment effect</th></tr>
<tr><td style="text-align: right;">          0</td><td style="text-align: right;">             1</td><td style="text-align: right;">             0.487</td><td style="text-align: right;">               0.489</td><td style="text-align: right;">                -0.003</td></tr>
<tr><td style="text-align: right;">          1</td><td style="text-align: right;">             0</td><td style="text-align: right;">             0.442</td><td style="text-align: right;">               0.495</td><td style="text-align: right;">                -0.053</td></tr>
<tr><td style="text-align: right;">          2</td><td style="text-align: right;">             1</td><td style="text-align: right;">             0.457</td><td style="text-align: right;">               0.494</td><td style="text-align: right;">                -0.037</td></tr>
<tr><td style="text-align: right;">          3</td><td style="text-align: right;">             1</td><td style="text-align: right;">             0.514</td><td style="text-align: right;">               0.489</td><td style="text-align: right;">                 0.025</td></tr>
<tr><td style="text-align: right;">          4</td><td style="text-align: right;">             1</td><td style="text-align: right;">             0.549</td><td style="text-align: right;">               0.491</td><td style="text-align: right;">                 0.058</td></tr>
<tr><td style="text-align: right;">          5</td><td style="text-align: right;">             0</td><td style="text-align: right;">             0.439</td><td style="text-align: right;">               0.497</td><td style="text-align: right;">                -0.058</td></tr>
<tr><td style="text-align: right;">          6</td><td style="text-align: right;">             0</td><td style="text-align: right;">             0.462</td><td style="text-align: right;">               0.492</td><td style="text-align: right;">                -0.030</td></tr>
<tr><td style="text-align: right;">          7</td><td style="text-align: right;">             1</td><td style="text-align: right;">             0.479</td><td style="text-align: right;">               0.498</td><td style="text-align: right;">                -0.019</td></tr>
<tr><td style="text-align: right;">          8</td><td style="text-align: right;">             0</td><td style="text-align: right;">             0.462</td><td style="text-align: right;">               0.492</td><td style="text-align: right;">                -0.030</td></tr>
<tr><td style="text-align: right;">          9</td><td style="text-align: right;">             0</td><td style="text-align: right;">             0.455</td><td style="text-align: right;">               0.493</td><td style="text-align: right;">                -0.038</td></tr>
</table>