Here we load in the data from Nan.
#requires from python: matplotlib, numpy and scipy
# from debian: ffmpeg and libx264
import autoPlanLib as myTools #this has some custom functions to clean up the code here
#load patient DVH data (settings are in autoPlanLib.py)
patients = []
# read ting's data
# for PTV DVH line pairs are dose and volume (for 24 patients)
f = open('ting_data/DVH_PTV_23Patients.txt')
parity = 0 #tracks line pairs
for line in f.readlines():
if len(line) >1:
if parity == 0:
#we have a new pair of dose,volume lines (and a new patient)
patients.append([])
patients[-1].append(myTools.Organ())
#start with dose
patients[-1][-1].name = 'PTV'
patients[-1][-1].dose = map(lambda x: float(x), line.rstrip().split(' '))
patients[-1][-1].n = len(patients[-1][-1].dose)
#increment parity
parity = 1
else:
#this is volume data for the above dose data
patients[-1][-1].volume = map(lambda x: float(x)/100., line.rstrip().split(' '))
#increment parity
parity = 0
#now read in the Rectum-DWH
f = open('ting_data/Rectum-DWHData_23Patients.txt')
#the first line is doses:
doses = map(lambda x: float(x), f.readline().rstrip().split(' '))
pid = 0
for line in f.readlines():
if len(line.rstrip()) >1:
#this is wall coverage
patients[pid].append(myTools.Organ())
patients[pid][-1].name = 'Rectum DWH'
patients[pid][-1].n = len(doses)
patients[pid][-1].dose = doses
patients[pid][-1].volume = map(lambda x: float(x)/100., line.rstrip().split(' '))
pid += 1
f.close()
print "total patients:%i" % pid
f.close()
print "total patients:%i" % len(patients)
#now read in the bladder-DWH
f = open('ting_data/Bladder-DWHData_23Patients.txt')
#the first line is doses:
doses = map(lambda x: float(x), f.readline().rstrip().split(' '))
pid = 0
for line in f.readlines():
if len(line.rstrip()) >1:
#this is wall coverage
patients[pid].append(myTools.Organ())
patients[pid][-1].name = 'Bladder DWH'
patients[pid][-1].n = len(doses)
patients[pid][-1].dose = doses
patients[pid][-1].volume = map(lambda x: float(x)/100., line.rstrip().split(' '))
pid += 1
f.close()
print "total patients:%i" % pid
total patients:23 total patients:23 total patients:23
myTools = reload(myTools) #when needed
#Some useful parameters
Rx_dose = 100
num_pat = size(patients,0) #number of patients in library (60)
num_orgs = 3 #number of organs #len(patients[pat])
Here we can see the range of the DVH for each organ.
#plotting DVH of library
num_pat = 23
fig, ax = plt.subplots(1,3,figsize=(18,5))
for pat in range(num_pat):
for org in range(num_orgs):
data = patients[pat][org]
ax[org].plot(data.dose,multiply(data.volume,100.0))#,organ_fmt[o])
if pat == num_pat - 1:
if org == 0:
ax[org].set_title("All " + data.name)
ax[org].set_xlabel('Dose (cGy)')
ax[org].set_ylabel('Volume (%)')
ax[org].set_ylim(0,100)
else:
ax[org].set_title("All " + data.name)
ax[org].set_xlabel('Dose (cGy)')
ax[org].set_ylabel('Area (%)')
ax[org].set_ylim(0,100)
Here we have some freedom in how we re-sample the data for use in PCA. PCA requires a data matrix, each row being a observation (patient), and each column representing some measurement. The PCA library we use also assumes that the number of columns in the data matrix is less than or equal to the number or rows.
A few different sampling schemes were tested, but in the end they all gave very similar results.
Below is a scheme to weight the sampling grid quadratically towards lower volume (%) values (higher dose regions).
# say we want to weight samples by x^2
# integrate, then invert we get 3/x^3, then uniformly sample
n_pts = 20
grid_points = zeros((num_orgs,n_pts))
g = mgrid[0.0:1.0:(1./float(n_pts))] #a uniform grid with desired number of sampling points
sampling = map(lambda x: pow(x,1.0/3.0)*100, g) #map uniform grid through weighting function
sampling = abs(array(sampling)-98) #here the sampling points are effectively reversed and shifted
print "Volume sampling points (%):"
print sampling
#Vols for PTV, RECT and BLAD:
for i in range(3):
grid_points[i] = sampling / 100.0
Below are 2 hand picked sampling scheme which treats the PTV and OARS differently
n_pts = 12 #points sampled from each organ
grid_points = zeros((num_orgs,n_pts))
#Vols for PTV:
grid_points[0] = array( (99,98,97,96,95,90,50, 5, 4, 3, 2, 1) ) / 100.0
#Vols for RECT:
grid_points[1] = array( (90,70,60,50,40,30,20,10, 9, 7, 5, 1) ) / 100.0
#Vols for BLAD:
grid_points[2] = array( (90,70,60,50,40,30,20,10, 9, 7, 5, 1) ) / 100.0
print "Volume sampling points (%):"
print 100*grid_points
#another hand picked sampling scheme with fiewer points (for simplicity)
n_pts = 6 #points sampled from each organ
grid_points = zeros((num_orgs,n_pts))
#Vols for PTV:
grid_points[0] = array( (99,95,90,70,50,2) ) / 100.0
#Vols for RECT:
grid_points[1] = array( (90,60,30,20,10,5) ) / 100.0
#Vols for BLAD:
grid_points[2] = array( (90,60,30,20,10,5) ) / 100.0
print "Volume sampling points (%):"
print 100*grid_points
This makes the final result (when we reconstruct DVH from the PCA vectors) more eye pleasing.
#a uniform sampling scheme with 20 sampling points per organ
n_pts = 7
low_vol = 1.
high_vol = 99.
a = frange(low_vol, high_vol, (high_vol-low_vol)/float(n_pts-1))
a = a[::-1] #reversed
grid_points = repeat([a/100.],3,axis=0)
#Plot the grid
grid_points100 = 100*grid_points #this gives %dose units
plot(grid_points100[0],grid_points100[0],'o')
title('Sampling points for Volume (%)')
show()
Below, we re-sample the patient DVH data using the above volume grid points. No interpolation is used, we simply find the dose value where the DVH volume first drops below the sought volume value.
gridDim = len(grid_points[0])
V_grid = zeros((num_pat,num_orgs*gridDim))
for pat in range(num_pat):
for org in range(num_orgs):
data = patients[pat][org]
for i in range(len(grid_points[org])):
V_grid[pat,org*gridDim+i]=myTools.DoseAtVol(data.dose,data.volume,grid_points[org][i])/Rx_dose
#this defines the indices for each organ in the rows of V_grid
r = (range(0,n_pts),range(n_pts,2*n_pts),range(2*n_pts,3*n_pts))
fig, axes = plt.subplots(1,3,figsize=(18,4))
for pat in range(num_pat):
axes[0].plot( V_grid[pat,r[0]], grid_points100[0],'-m', alpha=.3)
axes[1].plot( V_grid[pat,r[1]], grid_points100[1],'-r', alpha=.3)
axes[2].plot( V_grid[pat,r[2]], grid_points100[2],'-b', alpha=.3)
#axes[i,2].plot([],[],'--k',label='Neg(%i)'% m_idx)
#axes.plot(V_grid[p,:],'.',alpha = 0.5)
axes[0].set_title(patients[0][0].name)
map(lambda x: x.set_xlabel('Dose/RxDose'),axes)
axes[0].set_ylabel('Volume (%)')
axes[1].set_title(patients[0][1].name)
axes[2].set_title(patients[0][2].name)
show()
MatPlotLib has a built in PCA function. Below are the variances (extent of data along given axis) plotted for each of Principal Component (PC) directions.
from matplotlib.mlab import PCA
V_pca = PCA(V_grid)
scl = 1.2;
figure(figsize=(scl*7,scl*5))
title('PCA Fractions')
plot(V_pca.fracs*100,'or') # the proportion of variance of each of the principal components
ylabel("Variance (%)")
xlabel("Principal Component")
ylim(-1,46)
xlim(-1.0,20.5)
show()
#scores for the first 3 modes for each patient
print"PC0 PC1 PC2 PC3 PC4 PC5"
for p in range(len(patients)):
print "%f %f %f %f %f %f" % (V_pca.Y[p,0],V_pca.Y[p,1],V_pca.Y[p,2],V_pca.Y[p,3],V_pca.Y[p,4],V_pca.Y[p,5])
PC0 PC1 PC2 PC3 PC4 PC5 9.175882 0.663812 0.481863 1.149688 2.464962 0.905728 4.363763 -0.468688 -1.388054 -2.029392 -0.909694 1.236813 -0.643974 -3.205325 -1.030343 1.459868 -0.700601 0.067068 0.016273 -1.182734 0.334291 -1.492143 -0.721783 0.305363 0.735720 -2.561788 0.981450 -1.309403 -0.212288 -0.212751 -0.435824 -2.728386 0.574765 0.592307 -0.146848 0.044809 0.273958 1.864062 -1.221095 -1.971976 -1.555848 0.820391 -0.092533 2.758174 -1.138946 1.136184 -0.085078 0.661712 -4.105368 -1.387377 -1.273561 -0.224999 0.396177 -0.661652 -0.083079 2.660517 -1.045863 -2.744873 0.444727 0.052543 -3.039112 -1.004980 0.889876 0.428694 0.088355 0.011090 -1.756426 0.851387 2.104856 -0.692612 0.562391 0.023461 -1.446658 1.399865 2.790175 0.110173 -1.754612 -0.120992 3.262356 3.132581 -0.372928 1.380879 -2.496753 -2.380364 -1.893900 1.847863 1.203215 1.006428 0.084566 -0.004083 -0.032347 -1.148412 2.453693 0.444699 0.060279 0.359910 1.542104 1.511919 0.545571 1.167029 -0.034101 0.131839 -5.543866 2.666806 -0.041786 1.028308 1.357742 1.814570 1.247495 -2.816382 1.225766 -0.020452 -0.451306 0.207070 -2.263550 0.276388 0.007659 -1.916210 2.607835 -2.229633 -2.948097 -0.377823 -3.168764 1.554737 0.374077 -0.207425 -0.139774 -2.163018 -2.174001 0.448559 -0.727158 0.409859 3.806957 -0.588460 -0.737839 0.494507 1.354961 -1.235328
The plot below gives a hint as to how the data matrix is sorted. Each row represents a patient DVH and the first 1/3 of the columns sample the PTV, the middle 1/3 sample the Rectum, and the last 1/3 sample the Bladder.
The PCA modes represent (unit) directions in the high dimensional data space (after the mean for each column has been subtracted and the standard deviation is divided out).
n_comps = 6
p = V_pca
fig, axes = plt.subplots(figsize=(18,7))
for i in range(n_comps):
axes.plot(p.Wt[i,:],label='PC %i'%i)
axes.plot(zeros(num_orgs*n_pts),':k')
axes.set_xlabel('Volume (%) for PTV (Purple), Rectum (Red), Bladder (Blue)')
axes.set_ylabel('weights/correlations (arb)')
axes.set_title('First %i PCA modes' % n_comps)
axes.legend(loc=0)
#format Ticks
axes.set_xticks(range(3*n_pts))
#make integer axis of Vol axis
l = axes.set_xticklabels(array(concatenate((grid_points100[0],grid_points100[1],grid_points100[2])),int)) #output hidden
junk = map(lambda x: x.set_color('purple'), l[:n_pts] )
junk = map(lambda x: x.set_color('red'), l[n_pts:2*n_pts] )
junk = map(lambda x: x.set_color('blue'), l[2*n_pts:] )
Below we plot the PCA mode in the first column, the component along the mode for each patient in the second column, and the 'extreemum' DVH (Three patients with the most negative, closest to zero, and most positive components along the PCA mode)
n_comps = 6
p = V_pca
fig2, axes2 = plt.subplots(n_comps,3,figsize=(18,n_comps*6))
for i in range(n_comps):
axes2[i,0].plot( zeros((len(grid_points[0]),1)),grid_points100[0],':k')
axes2[i,0].plot(p.Wt[i,r[0]],grid_points100[0],'-m',label="PTV")
axes2[i,0].plot(p.Wt[i,r[1]],grid_points100[1],'-r',label="Rectum")
axes2[i,0].plot(p.Wt[i,r[2]],grid_points100[2],'-b',label="Bladder")
axes2[i,0].set_title('Principal Componant Vector # %i' % i)
axes2[i,0].set_ylabel('Volume (%)')
axes2[i,0].set_xlabel('Weight/Correlation (arb)')
axes2[i,0].legend(loc=0)
#plot PC dimension (componants for each patient)
axes2[i,1].plot( p.Y[:,i],p.Y[:,i],'sr',mew=0,ms=20,alpha = 0.15)
axes2[i,1].set_title('PC %i vs. PC %i (squares are patients)' % (i,i))
axes2[i,1].set_xlabel('Weight/Correlation (arb)')
#Find Extremum Patients for PC mode (Most Negative, Closest to Zero, and Most Positive)
neg_idx = find(min(p.Y[:,i]) == p.Y[:,i])
tempArray = map(lambda x: abs(x), p.Y[:,i]) #map absolute value
zero_idx = find(min(tempArray) == tempArray)
pos_idx = find(max(p.Y[:,i]) == p.Y[:,i])
c_fmt = ('m','r','b') #line colors for PTV, Rect, and Blad
axes2[i,2].plot([],[],'--k',label='Neg(%i)'% neg_idx)
axes2[i,2].plot([],[],':k',label='Zero(%i)' % zero_idx)
axes2[i,2].plot([],[],'-k',label='Pos(%i)' % pos_idx)
for org in range(3):
axes2[i,2].plot( patients[neg_idx][org].dose, 100.*array(patients[neg_idx][org].volume),'--'+c_fmt[org])
axes2[i,2].plot( patients[zero_idx][org].dose, 100.*array(patients[zero_idx][org].volume),':'+c_fmt[org])
axes2[i,2].plot( patients[pos_idx][org].dose, 100.*array(patients[pos_idx][org].volume),'-'+c_fmt[org])
axes2[i,2].set_xlim((0,Rx_dose*1.2))
axes2[i,2].set_ylim((0,110))
axes2[i,2].set_title('Extremum DVH along PC %i ' % i)
axes2[i,2].set_xlabel('Dose (cGy)')
axes2[i,2].set_ylabel('Volume (%)')
axes2[i,2].legend(loc='upper center',ncol=3,frameon=False)
Here I plot the DVH data in the PC space of the first 3 modes.
#3D scatter in PCV (0,1,3) space
from mpl_toolkits.mplot3d.axes3d import Axes3D
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(1, 1, 1, projection='3d')
pY = V_pca.Y
p = ax.scatter(pY[:,0], pY[:,1], pY[:,2],s=60,c='r',marker='o',edgecolors='none',alpha = 0.5)
#maybe try animation later
p = V_pca
proj = p.project(V_grid[0,:])
temp = zeros(proj.shape)
errors = zeros(proj.shape)
n_pats = len(patients) #number of patients to consider
n_pc = 6 #number of pca modes to include
#placeholders to find max error patient
max_error = 0.0
max_error_row = []
max_error_pca = []
for pat in range(n_pats):
row = V_grid[pat,:]
proj = p.project(row)
temp = zeros(proj.shape)
for i in range(n_pc):
temp = temp + multiply(p.Wt[i,:],proj[i])
#compute error between pca reduced representation
temp_err = abs(temp*p.sigma + p.mu - row)
#compute max error and compare to global
if max(temp_err) > max_error:
#save patient index
max_error_idx = pat
max_error_row = row.copy()
max_error_pca = temp.copy()
errors = errors + temp_err
#plot PCA errors
fig, ax = plt.subplots(1,2,figsize=(16,7))
ax[0].plot(100*errors[r[0]]/float(n_pats),grid_points100[0],':om',label="PTV")
ax[0].plot(100*errors[r[1]]/float(n_pats),grid_points100[0],':or',label="Rectum")
ax[0].plot(100*errors[r[2]]/float(n_pats),grid_points100[0],':ob',label="Bladder")
ax[0].set_title('Average Error with first %i PCA modes' % n_pc)
ax[0].legend(loc='best')
ax[0].set_ylabel('Volume (%)')
ax[0].set_xlabel('Absolute Error (% of RxDose)')
#example Origional and Reduced
dvh_pca = max_error_pca*p.sigma + p.mu
ax[1].plot(Rx_dose * dvh_pca[r[0]],grid_points100[0],'-m',label="PTV")
ax[1].plot(Rx_dose * dvh_pca[r[1]],grid_points100[1],'-r',label="Rectum")
ax[1].plot(Rx_dose * dvh_pca[r[2]],grid_points100[2],'-b',label="Bladder")
#plot(max_error_pca*p.sigma + p.mu)
ax[1].plot(Rx_dose * max_error_row[r[0]],grid_points100[0],'--m')
ax[1].plot(Rx_dose * max_error_row[r[1]],grid_points100[1],'--r')
ax[1].plot(Rx_dose * max_error_row[r[2]],grid_points100[2],'--b')
ax[1].plot([],[],'--k',label='Orig.')
ax[1].plot([],[],'-k',label='Proj.')
ax[1].set_ylabel('Volume (%)')
ax[1].set_xlabel('Dose (cGy)')
ax[1].legend(loc=0,ncol=2)
ax[1].set_title('Worst Max Error (Patient %i)' % max_error_idx)
show()