import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
from sklearn.cluster import MeanShift, estimate_bandwidth
import pandas as pd
from scipy import stats
from scipy.stats import beta
from math import sin
from random import randint
import matplotlib.pyplot as plt
import itertools as it
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)
import collections
def recursively_default_dict():
    """Return an autovivifying mapping: missing keys spawn further such dicts,
    giving a tree of arbitrary depth that can be indexed without pre-declaring levels."""
    nested = collections.defaultdict(recursively_default_dict)
    return nested
How do you measure the overlap of two distributions?
This is a frequently asked question, over which a remarkable number of studies have been published. When faced with a structured data set, the degree, location and timing of contacts between populations we are focusing on can often serve as a direct answer to our questions, or offer a path for further research.
Existing metrics for this purpose usually take into account the means, standard deviations and sizes of the populations involved, the combinations of which are matched against known distributions in order to extract their significance (Fisher or Student's t distributions are the common resort).
The use of summary statistics as the basis for these metrics was related to the computing power available to the researchers that developed them, which, for most, was very limited. What I propose here is a pretty brutish approach to the matter, but one that takes advantage of our modern tools and processors.
We will generate three clouds of points (the possible output of dimensionality reduction on a very neat data set). Two of these populations will have their distance proportional to the sine of an 'X' variable (time, physical location, your choice).
## creating a stable pop:
# Diagonal covariance matrices for the three synthetic populations.
# Population 1 is wider along the first axis; 2 and 3 are identical tight ellipsoids.
cov_pop1 = [[2, 0, 0],
            [0, 1, 0],
            [0, 0, 1]]
cov_pop2 = [[1, 0, 0],
            [0, .3, 0],
            [0, 0, .1]]
cov_pop3 = [[1, 0, 0],
            [0, .3, 0],
            [0, 0, .1]]
### Number of values:
Sizes = [170, 130, 30]   # number of points drawn from each population
labels = [2, 0, 1]       # label given to each population, in draw order
# One label per sampled point: 170 "2"s, then 130 "0"s, then 30 "1"s.
# (Original had a duplicated `label_vector= label_vector=` assignment and a
# no-op list-comprehension copy of `labels`.)
label_vector = np.repeat(np.array(labels), Sizes)
def jumpingJack(x):
    """Sample the three populations at time-point *x* and render a 3-D scatter.

    Populations 2 and 3 sit mirrored around z = 0 at a distance proportional
    to sin(x); population 1 stays fixed. Reads the module-level cov_pop*,
    Sizes, labels and label_vector. Displays the figure via iplot; returns None.
    (Indentation was lost in the notebook export; structure restored here.)
    """
    height = sin(x) * 5
    mean_pop1 = [2, 0, 7]
    mean_pop3 = [0, 10, height]
    mean_pop2 = [0, 10, -height]   # mirror of pop3 across z = 0
    Pop1 = np.random.multivariate_normal(mean_pop1, cov_pop1, Sizes[0])
    Pop2 = np.random.multivariate_normal(mean_pop2, cov_pop2, Sizes[1])
    feature = np.vstack((Pop1, Pop2))
    Pop3 = np.random.multivariate_normal(mean_pop3, cov_pop3, Sizes[2])
    data = np.vstack((feature, Pop3))

    # One trace per population, selected with a boolean mask over label_vector
    # (equivalent to the original per-axis index list-comprehensions, built once).
    fig_data = [go.Scatter3d(
        x=data[label_vector == i, 0],
        y=data[label_vector == i, 1],
        z=data[label_vector == i, 2],
        mode="markers",
        marker={
            'line': {'width': 0},
            'size': 4,
            'symbol': 'circle',
            "opacity": .8
        },
        name=str(i)
    ) for i in labels]

    layout = go.Layout(
        margin=dict(
            l=0,
            r=0,
            b=0,
            t=0
        )
    )
    fig = go.Figure(data=fig_data, layout=layout)
    iplot(fig)

interact(jumpingJack, x=(0, 30, 1))
Failed to display Jupyter Widget of type interactive
.
If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean that the widgets JavaScript is still loading. If this message persists, it likely means that the widgets JavaScript library is either not installed or not enabled. See the Jupyter Widgets Documentation for setup instructions.
If you're reading this message in another frontend (for example, a static rendering on GitHub or NBViewer), it may mean that your frontend doesn't currently support widgets.
<function __main__.jumpingJack>
We have three sets of points, two of which see their distribution overlap as a function of 'X'.
What I wanted out of this was something akin to conditional probabilities:
And once again i was brought back to KDE.
We approximate O, the whole of the space occupied by the two distributions, using a meshgrid spanning the min and max of each cloud in each dimension covered.
The coarseness of this grid we set as P. Then N, the total number of points in this grid, is equal to P ** the number of dimensions.
Then, for every combination of pairs of populations, we can determine which gridpoints have significant densities relative to each distribution. I set this threshold to .005, and normalized densities by those of the actual data points they were drawn from.
It then becomes a matter of P(A|B) = P(A int B) / P(B), same for P(B|A), and Empty= N - P(A) - P(B) + P(A int B).
## Function to calculate overlap given a coordinates matrix and a dictionary of indices.
def extract_profiles(global_data, target_ind_dict, threshold, P):
    """Pairwise overlap statistics between labelled point clouds.

    Parameters
    ----------
    global_data : (n_points, 3) array of coordinates.
    target_ind_dict : {label: [row indices into global_data]}.
    threshold : normalized-density cutoff for a grid point to count as
        "occupied" by a population (e.g. .005).
    P : grid resolution per axis; the background grid has P**3 points.

    Returns
    -------
    Nested dict: Stats[(pop_a, pop_b)][pop_a] = P(a | b) occupancy fraction,
    ...[pop_b] = P(b | a), ...['empty'] = fraction of grid in neither.

    Fixes vs. original: restored indentation lost in export; corrected the
    misleading "MeanShift clustering" comment (this fits KDEs); forced float
    division in empty_space (integer division under Python 2); guarded the
    conditional fractions against populations with zero significant points.
    """
    # Candidate KDE bandwidths spanning the data range, cross-validated below.
    params = {'bandwidth': np.linspace(np.min(global_data), np.max(global_data), 20)}
    grid = GridSearchCV(KernelDensity(algorithm="ball_tree", breadth_first=False),
                        params, verbose=0)

    # Fit one kernel-density estimator per population.
    combine = {}
    for bull in target_ind_dict.keys():
        grid.fit(global_data[target_ind_dict[bull], :])
        combine[bull] = grid.best_estimator_

    Stats = recursively_default_dict()

    for combo in it.combinations(target_ind_dict.keys(), 2):
        pop1, pop2 = combo
        All_coords = [x for x in it.chain(*[target_ind_dict[z] for z in combo])]
        Quanted_set = global_data[All_coords, :]

        # Regular P x P x P grid over the joint bounding box of the two clouds.
        i_coords, j_coords, z_coords = np.meshgrid(
            np.linspace(min(Quanted_set[:, 0]), max(Quanted_set[:, 0]), P),
            np.linspace(min(Quanted_set[:, 1]), max(Quanted_set[:, 1]), P),
            np.linspace(min(Quanted_set[:, 2]), max(Quanted_set[:, 2]), P),
            indexing='ij')
        traces = [x for x in it.product(range(P), range(P), range(P))]
        background = np.array([i_coords, j_coords, z_coords])
        background = np.array([background[:, c[0], c[1], c[2]] for c in traces])

        def significant(pop):
            # Score grid points under this pop's KDE (log-densities), place
            # them on the CDF of the pop's own data-point scores, and
            # threshold to a 0/1 occupancy indicator per grid point.
            scores = combine[pop].score_samples(background)
            own = combine[pop].score_samples(global_data[target_ind_dict[pop], :])
            normed = scipy.stats.norm(np.mean(own), np.std(own)).cdf(scores)
            return [int(v >= threshold) for v in normed]

        pop1_fist = significant(pop1)
        pop2_fist = significant(pop2)

        pop1_and_2 = sum(1 for a, b in zip(pop1_fist, pop2_fist) if a and b)
        n_pop1 = sum(pop1_fist)
        n_pop2 = sum(pop2_fist)
        # P(A|B) = P(A int B) / P(B) and vice versa; 0 when a pop occupies
        # no grid points at all (original raised ZeroDivisionError).
        pop1_I_pop2 = pop1_and_2 / float(n_pop1) if n_pop1 else 0.0
        pop2_I_pop1 = pop1_and_2 / float(n_pop2) if n_pop2 else 0.0
        empty_space = 1 - (n_pop1 + n_pop2 - pop1_and_2) / float(background.shape[0])

        Stats[combo][pop1] = pop1_I_pop2
        Stats[combo][pop2] = pop2_I_pop1
        Stats[combo]['empty'] = empty_space
    return Stats
## Generating sets of coordinates given X.
def jumpingJack(x):
    """Sample the three clouds at time-point *x* and return them stacked.

    Populations 2 and 3 are mirrored around z = 0 at distance proportional to
    sin(x). Reads module-level cov_pop* and Sizes. Returns a
    (sum(Sizes), 3) array: pop1 rows, then pop2, then pop3.
    (Indentation was lost in the notebook export; structure restored here,
    and the two chained vstacks collapsed into one — same output.)
    """
    height = sin(x) * 5
    mean_pop1 = [-1, 0, 4]
    mean_pop3 = [0, 10, height]
    mean_pop2 = [0, 10, -height]
    Pop1 = np.random.multivariate_normal(mean_pop1, cov_pop1, Sizes[0])
    Pop2 = np.random.multivariate_normal(mean_pop2, cov_pop2, Sizes[1])
    Pop3 = np.random.multivariate_normal(mean_pop3, cov_pop3, Sizes[2])
    return np.vstack((Pop1, Pop2, Pop3))
# Varying 'X'. This can take a few minutes.
# Sweep x over [0, 30) in steps of .1, recomputing the clouds and their
# pairwise overlap profiles at each step. (Loop indentation restored;
# export had flattened it.)
Windows = recursively_default_dict()
target_indx = {z: [x for x in range(len(label_vector)) if label_vector[x] == z]
               for z in list(set(label_vector))}
threshold = .005
P = 30
for window in np.arange(0.0, 30, .1):
    data = jumpingJack(window)
    profiles = extract_profiles(data, target_indx, threshold, P)
    Windows[window] = profiles
# Plot overlaps
def plot_proximities(tup):
    """Plot the two conditional overlaps and the leftover space across all
    computed Windows for one population pair.

    *tup* is a two-character string such as '01' naming the pair; reads the
    module-level Windows dict filled by the sweep above. Displays via iplot;
    returns None. (Indentation restored; the repeated pair-key tuple is
    built once instead of three times per window — same lookups.)
    """
    Xs = []
    pop1 = []
    labels = []
    pair = (int(tup[0]), int(tup[1]))
    for window in Windows.keys():
        Xs.extend([window] * 3)
        stats_here = Windows[window][pair]
        pop1.append(stats_here[pair[0]])
        pop1.append(stats_here[pair[1]])
        pop1.append(stats_here['empty'])
        labels.extend(['P2|1', 'P1|2', 'leftover space'])

    coords = {i: [x for x in range(len(labels)) if labels[x] == i]
              for i in list(set(labels))}
    datum = np.array([Xs, pop1]).T

    fig_data = [go.Scatter(
        x=datum[coords[i], 0],
        y=datum[coords[i], 1],
        mode='lines',
        name=i
    ) for i in coords.keys()]

    layout = go.Layout(
        margin=dict(
            l=0,
            r=0,
            b=0,
            t=0
        ),
        title='Variation between',
        yaxis=dict(
            range=[0, 1]
        )
    )
    fig = go.Figure(data=fig_data, layout=layout)
    iplot(fig)

interact(plot_proximities, tup=['01', '02', '12'])
Failed to display Jupyter Widget of type interactive
.
If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean that the widgets JavaScript is still loading. If this message persists, it likely means that the widgets JavaScript library is either not installed or not enabled. See the Jupyter Widgets Documentation for setup instructions.
If you're reading this message in another frontend (for example, a static rendering on GitHub or NBViewer), it may mean that your frontend doesn't currently support widgets.
<function __main__.plot_proximities>