#!/usr/bin/env python # coding: utf-8 # In[76]: import numpy as np import random from tabulate import tabulate # In[77]: def sim_data(N, D, p_vl=0.01, p_l=0.1, p_h=0.9, p_skill=0.9): #N: num patients #D: num doctors #p_vl, p_l, p_h: very low, low, and high probability (of recovery) #p_skill: probability of patient being assigned a doctor with skill matching severity #split doctors into two sets and make exactly half of them highly skilled beta = np.zeros(D) D_sk, D_nsk = D/2, D - D/2 #num skilled v.s. not skilled doctors, account for rounding error when D is odd beta[random.sample(range(D),D_sk)] = 1. #split patients into two sets and make exactly half of them highly severe theta = np.zeros(N) theta[random.sample(range(N),N/2)] = 1. S = np.zeros(N) #skill level for each patient Z = np.zeros((N,D)) #patient-doctor assignment matrix Y = np.zeros(N) #outcome: did the patient recover? for n in range(N): #for each patient choose the skill level of their doctor S[n] = np.random.binomial(1, theta[n]*p_skill + (1-theta[n])*(1-p_skill)) #choose doctor assignment to patient n Z[n,:] = np.random.multinomial(1, beta*S[n]/float(D_sk) + (1-beta)*(1-S[n])/float(D_nsk)) d_n = Z[n,:].argmax() p_recovery = S[n]*theta[n]*p_l + (1-S[n])*theta[n]*p_vl + (1-theta[n])*p_h Y[n] = np.random.binomial(1, p_recovery) return Y, Z, theta, beta N, D = 1000, 10 Y, Z, theta, beta = sim_data(N, D) print('Y: total, num recovered',N,Y.sum()) # In[78]: tbl_naive = [(d, int(Z[:,d].sum()), (Y*Z[:,d]).sum()/Z[:,d].sum()) for d in range(D)] print tabulate(tbl_naive,headers=['doctor id', 'num patients', 'survival rate'],floatfmt=".3f", tablefmt="html") # In[79]: def stratify_recovery(Y,Z,N,D,theta): tau_strat = np.zeros((2,D)) tau_strat[0,:] = np.array([((1-theta)*Y*Z[:,d]).sum()/((1-theta)*Z[:,d]).sum() for d in range(D)]) tau_strat[1,:] = np.array([((theta)*Y*Z[:,d]).sum()/((theta)*Z[:,d]).sum() for d in range(D)]) return tau_strat tau_strat = stratify_recovery(Y,Z,N,D,theta) Nds = map(int,np.dot(theta, Z)) #num severely ill people treated by each doctor print tabulate(zip(range(D),Nds,tau_strat[0,:],tau_strat[1,:]), headers=['doctor id', 'num severe patients', 'recovery rate | non-severe', 'recovery rate | severe'], floatfmt=".3f", tablefmt="html") # In[80]: def patient_match(Y, Z, N, D, theta): #pick the patients of each doctor and match them randomly with another patient of the same severity (high or low) #who had another doctor, then find average outcome difference #question: they're all getting "treated" there are no controls? Then does this still make sense? tau = np.zeros(N) #noisy treatment effect for each patient for n in range(N): #find patients of same severity who were treated by a different doctor: dn = Z[n,:].argmax() #doctor of patient n g = (theta*theta[n] + (1-theta)*(1-theta[n]))*(1-Z[:,dn]) #pick one at random uniformly match = np.random.multinomial(1, g/g.sum()).argmax() #confirm that they have the same severity and have different doctors (this should always be true) assert theta[match] == theta[n], 'patients not matched by severity' assert Z[match,:].argmax() != Z[n,:].argmax(), 'patients have same doctor (%i, %i)' % (match, n) #calculate difference in outcomes: tau[n] = Y[n] - Y[match] #calculate average difference in outcomes over matches for each doctor: tau_doctor = np.array([(tau*Z[:,d]).sum()/Z[:,d].sum() for d in range(D)]) return tau_doctor tau_doctor = patient_match(Y, Z, N, D, theta) # In[81]: print('ground truth skill of doctors',beta) # In[82]: header_match = ['doctor id','avg treatment effect', 'doctor skill', 'num treated', 'avg recovery rate'] tbl_list_match = [(d,t,int(b),int(Z[:,d].sum()),(Y*Z[:,d]).sum()/Z[:,d].sum()) for t,b,d in zip(tau_doctor, beta, range(D))] print tabulate(tbl_list_match, headers=header_match,floatfmt=".3f", tablefmt="html") # In[ ]: # In[90]: def propensity_score(Z, N, D, theta): p = np.zeros((2,D)) p[0,:] = [(Z[:,d]*(1-theta)).sum()/(1-theta).sum() for d in range(D)] p[1,:] = [(Z[:,d]*(theta)).sum()/(theta).sum() for d in range(D)] return p p = propensity_score(Z, N, D, theta) print('propensity',p) def avg_tau_propensity_weighted(Y, Z, p, N, D, theta): tau = np.zeros(D) itheta = map(int, theta) #calculate the weighted effect on the treated (by doctor d) eft = np.array([Y*Z[:,d]/p[itheta,d] for d in range(D)]) #calculate the weighted effect on the untreated (by doctor d) efu = np.array([Y*(1-Z[:,d])/(1-p[itheta,d]) for d in range(D)]) return eft.mean(axis=1), efu.mean(axis=1), (eft - efu).mean(axis=1) eft, efu, tau_pw = avg_tau_propensity_weighted(Y, Z, p, N, D, theta) print tabulate([['non-severe']+list(p[0,:5]),['severe']+list(p[1,:5])], headers=['unit covariate']+['doc_%i'%d for d in range(5)], floatfmt=".3f", tablefmt="html") print tabulate([['non-severe']+list(p[0,5:]),['severe']+list(p[1,5:])], headers=['unit covariate']+['doc_%i'%d for d in range(5,D)], floatfmt=".3f", tablefmt="html") print tabulate(zip(range(D),map(int,beta),eft,efu,tau_pw), headers=['doctor id','doctor skill','treated recovery','untreated recovery','avg treatment effect'], floatfmt=".3f", tablefmt="html") # In[ ]: # In[ ]: # In[ ]: