Kyle Cranmer, Nov 19, 2015
Based on Estimating the significance of a signal in a multi-dimensional search by Ofer Vitells and Eilam Gross http://arxiv.org/pdf/1105.4355v1.pdf
This is for the special case of a likelihood function of the form $L(\mu, \nu_1, \nu_2)$ where $\mu$ is a single parameter of interest and $\nu_1,\nu_2$ are two nuisance parameters that are not identified under the null. For example, $\mu$ is the signal strength of a new particle and $\nu_1,\nu_2$ are the unknown mass and width of the new particle. Under the null hypothesis, those parameters don't mean anything... aka they "are not identified under the null" in the statistics jargon. This introduces a 2-d look elsewhere effect.
The LEE correction in this case is based on
\begin{equation} E[ \phi(A_u) ] = P(\chi^2_1 > u) + e^{-u/2} (N_1 + \sqrt{u} N_2) \, \end{equation}where
The notebook is broken into two parts.
%pylab inline --no-import-all
from scipy.optimize import fsolve
from scipy.stats import chi2
from numpy import exp, sqrt
from scipy.ndimage.morphology import *
from scipy.ndimage import *
from scipy.stats import norm
import ROOT
Populating the interactive namespace from numpy and matplotlib
Welcome to ROOTaaS 6.05/03
def expectedEuler(u, n1, n2):
return chi2.cdf(u, 1) + np.exp(-u/2)*(n1+n2*np.sqrt(u))
def equations(p,exp_phi_1, exp_phi_2,u1, u2):
n1,n2 = p
return (exp_phi_1-expectedEuler(u1,n1,n2), exp_phi_2-expectedEuler(u2,n1,n2))
Usage: calculate n1,n2 based on expected value of Euler characteristic (calculated from toy Monte Carlo) at two different levels u1, u2. For example:
would lead to a call like this
n1, n2 = fsolve(equations, (1,1), args=(33.5, 94.6, 0., 1.))
def global_pvalue(u,n1,n2):
return expectedEuler(u,n1,n2)-1.
def do_LEE_correction(max_local_sig, u1, u2, exp_phi_1, exp_phi_2):
n1, n2 = fsolve(equations, (1,1), args=(exp_phi_1, exp_phi_2, u1,u2))
this_global_p = global_pvalue(max_local_significance**2, n1, n2)
print ' n1, n2 =', n1, n2
print ' local p_value = %f, local significance = %f' %(norm.cdf(-max_local_significance), max_local_significance)
print 'global p_value = %f, global significance = %f' %(this_global_p, -norm.ppf(this_global_p))
def calculateEulerCharacteristic(a):
face_filter=np.zeros((2,2))+1
right_edge_filter = np.array([[1,1]])
bottom_edge_filter = right_edge_filter.T
n_faces = np.sum(convolve(a,face_filter,mode='constant')>3)
n_edges = np.sum(convolve(a,right_edge_filter,mode='constant')>1)
n_edges += np.sum(convolve(a,bottom_edge_filter,mode='constant')>1)
n_vertices = np.sum(a>0)
EulerCharacteristic = n_vertices-n_edges+n_faces
print '%d-%d+%d=%d' %(n_vertices,n_edges,n_faces,EulerCharacteristic)
return n_vertices-n_edges+n_faces
# An example from the paper
n1, n2 = fsolve(equations, (1,1), args=(33.5, 94.6, 0., 1.))
print n1, n2
print equations((n1,n2),33.5, 94.6, 0., 1.)
#small difference wrt the paper, where n2 = 123
33.5 121.343467521 (0.0, -1.4210854715202004e-14)
# reproduce Fig 5 from paper (the markers are read by eye)
u = np.linspace(5,35,100)
global_p = global_pvalue(u,n1,n2)
plt.plot(u, global_p)
plt.scatter(35,2.E-5) #from Fig5
plt.scatter(30,2.E-4) #from Fig5
plt.scatter(25,2.5E-3) #from Fig5
plt.scatter(20,2.5E-2) #from Fig5
plt.scatter(15,.3) #from Fig5
plt.xlabel('u')
plt.ylabel('P(max q > u)')
plt.semilogy()
[]
#create Fig 3 of http://arxiv.org/pdf/1105.4355v1.pdf
a = np.zeros((7,7))
a[1,2]=a[1,3]=a[2,1]=a[2,2]=a[2,3]=a[2,4]=1
a[3,1]=a[3,2]=a[3,3]=a[3,4]=a[3,5]=1
a[4,1]=a[4,2]=a[4,3]=a[4,4]=1
a[5,3]=1
a[6,0]=a[6,1]=1
a=a.T
plt.imshow(a,cmap='gray',interpolation='none')
<matplotlib.image.AxesImage at 0x121ac24d0>
#should be 2
calculateEulerCharacteristic(a)
18-23+7=2
2
#Fully filled, should be 1
randMatrix = np.zeros((100,100))+1
calculateEulerCharacteristic(randMatrix)
10000-19800+9801=1
1
# split in half vertically, should be 2
randMatrix[50,:]=0
plt.imshow(randMatrix,cmap='gray')
calculateEulerCharacteristic(randMatrix)
9900-19501+9603=2
2
#split in half horizontally twice, should be 6
randMatrix[:,25]=0
randMatrix[:,75]=0
plt.imshow(randMatrix,cmap='gray')
calculateEulerCharacteristic(randMatrix)
9702-18911+9215=6
6
#remove a hole from middle of one, should be 5
randMatrix[25:30,50:53]=0
plt.imshow(randMatrix,cmap='gray')
calculateEulerCharacteristic(randMatrix)
9687-18873+9191=5
5
# Specify the necessary info
max_local_significance = 4.
root_file_name = 'FinalToys.root'
num_toy_scans = 26
names_of_toy_likleihood_scans = [('scan_toy_cond%d' %(i)) for i in range(11,num_toy_scans)]
# choose u1, u2 thresholds for doing the scan.
# these are arbitrary
# if tehre are enough toys the choice shouldn't matter, but
# we may want to do some tests with other choices
this_u1, this_u2 = 0.1, 0.9
# for testing, comment this out for normal use
#names_of_toy_likleihood_scans = [('scan_toy%d_sub' %(i)) for i in [0,0,0,0]]
def convert_hist_to_numpy(hist):
''' a little helper script'''
temp = np.zeros((hist.GetNbinsX(), hist.GetNbinsY()))
for i in range(temp.shape[0]):
for j in range(temp.shape[1]):
temp[i,j] = hist.GetBinContent(i+1, j+1)
return temp
# Read in histograms, convert them to numpy arrays
inFile = ROOT.TFile(root_file_name, 'READ')
likelihoodScans = []
for histName in names_of_toy_likleihood_scans:
inHist = inFile.Get(histName)
temp = convert_hist_to_numpy(inHist)
likelihoodScans.append(temp)
def getEulerCharacteristics(listOfScans, u1=0.1, u2=0.9):
'''loop through the likleihood scans and calculate expectation
of Euler characteristic for excursion sets above levels u1, u2'''
plt.figure(figsize=(15,5*15))
phis = np.zeros((len(listOfScans),2))
for scan_no, scan in enumerate(listOfScans):
plt.subplot(len(listOfScans),3,3*scan_no+1)
plt.imshow(scan.T, cmap='gray', aspect=15)
#get excursion sets above those two levels
exc1 = (scan>u1) + 0. #add 0. to convert from bool to double
exc2 = (scan>u2) + 0.
print 'u1,u2 = ', u1, u2
plt.subplot(len(listOfScans),3,3*scan_no+2)
plt.imshow(exc1.T, cmap='gray', aspect=15)
plt.subplot(len(listOfScans),3,3*scan_no+3)
plt.imshow(exc2.T, cmap='gray', aspect=15)
phi1 = calculateEulerCharacteristic(exc1)
phi2 = calculateEulerCharacteristic(exc2)
print phi1, phi2
phis[scan_no] = [phi1, phi2]
plt.savefig('islands.pdf')
print 'Exp phi_0=%f, phi_2=%f' %(mean(phis[:,0]), mean(phis[:,1]))
return mean(phis[:,0]), mean(phis[:,1])
expphi1, expphi2 = getEulerCharacteristics(likelihoodScans, u1=this_u1, u2=this_u2)
u1,u2 = 0.1 0.9 19194-37439+18262=17 8155-15714+7586=27 17 27 u1,u2 = 0.1 0.9 14631-28502+13886=15 10137-19714+9585=8 15 8 u1,u2 = 0.1 0.9 16773-32718+15957=12 6256-11930+5689=15 12 15 u1,u2 = 0.1 0.9 16931-32941+16024=14 6493-12419+5943=17 14 17 u1,u2 = 0.1 0.9 17421-33848+16447=20 6749-12923+6195=21 20 21 u1,u2 = 0.1 0.9 17694-34521+16845=18 9442-18231+8806=17 18 17 u1,u2 = 0.1 0.9 12003-23360+11376=19 6208-12093+5898=13 19 13 u1,u2 = 0.1 0.9 14879-29013+14154=20 8974-17472+8508=10 20 10 u1,u2 = 0.1 0.9 17099-33396+16315=18 8836-17120+8299=15 18 15 u1,u2 = 0.1 0.9 18214-35433+17241=22 6126-11697+5589=18 22 18 u1,u2 = 0.1 0.9 16529-32065+15558=22 7543-14517+6995=21 22 21 u1,u2 = 0.1 0.9 21077-41069+20006=14 4970-9378+4436=28 14 28 u1,u2 = 0.1 0.9 18591-36219+17652=24 10685-20724+10052=13 24 13 u1,u2 = 0.1 0.9 22817-44546+21738=9 13328-25875+12565=18 9 18 u1,u2 = 0.1 0.9 17403-33880+16495=18 9820-18960+9158=18 18 18 Exp phi_0=17.466667, phi_2=17.266667
do_LEE_correction(max_local_significance, this_u1, this_u2, expphi1, expphi2)
n1, n2 = 14.1275610834 12.5660894698 local p_value = 0.000032, local significance = 4.000000 global p_value = 0.021538, global significance = 2.022977
max_local_significance=4.96
do_LEE_correction(max_local_significance, this_u1, this_u2, expphi1, expphi2)
n1, n2 = 14.1275610834 12.5660894698 local p_value = 0.000000, local significance = 4.960000 global p_value = 0.000347, global significance = 3.391921
max_local_significance=3.87
do_LEE_correction(3.87, this_u1, this_u2, expphi1, expphi2)
n1, n2 = 14.1275610834 12.5660894698 local p_value = 0.000054, local significance = 3.870000 global p_value = 0.035005, global significance = 1.811845