import scipy
import numpy as np
import pandas as pd
import itertools as it
from math import sin
import collections
def recursively_default_dict():
    """Build an arbitrarily nested mapping: d[a][b][c] = v works with no setup."""
    nested = collections.defaultdict(recursively_default_dict)
    return nested
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import scale
from scipy.stats.stats import pearsonr
from scipy.stats import invgamma
from scipy.stats import beta
import matplotlib.pyplot as plt
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
import plotly.figure_factory as ff
from sklearn.metrics import silhouette_samples, silhouette_score
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)
import Lab_modules.StructE_tools as Ste
from Lab_modules.Generate_freq_vectors import generate_vectors_Beta
This notebook addresses the problem of studying genetic distances across genomic windows.
Create a synthetic data set of consecutive genomic windows of haploid data. Several functions are proposed to model genomic diversity. Population number and sampling size are user defined.
This section performs distance calculations across the data sets provided. All calculations are performed in PCA feature space and focus on one selected focus population.
At each window two centroids are calculated: that of the samples from the focus population and that of all the remainder. The distances of all samples to each centroid are calculated, and the mean and standard deviation of non-target distances are recorded.
Explores different ways to visualize the data produced.
We will begin by generating many data sets, each bearing population samples presenting predetermined degrees of correlation.
The protocol used to produce these samples is explored in another notebook, Genome Structure Sim, available in the GitHub directory Tools_Shop
The process begins with the simulation of frequency vectors of a given length, each representative of a single population. Through PCA, these vectors are then used to generate new vectors that respect a given prior of pairwise correlation.
### Simulation parameters for the synthetic frequency-vector library.
Nbranches= 4        # number of population branches (used later with generate_Branches_Beta)
L= 150              # haplotype length (loci per window)
n= 100              # vectors generated per parameter combination
rangeA= [1,2.5]     # range of the Beta 'a' parameter
rangeB = [.1,.6]    # range of the Beta 'b' parameter
steps= 20           # grid steps across rangeA/rangeB
n_comp = 100        # PCA components retained
density= 50         # NOTE(review): unused in this cell -- presumably for later branch density; confirm
vector_lib, background= generate_vectors_Beta(L,n,rangeA,rangeB,steps,n_comp)
# Reduce the frequency-vector library to PCA feature space.
pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(vector_lib)
features = pca.transform(vector_lib)
print(features.shape)
print(vector_lib.shape)
(2000, 100) (2000, 150)
### Functions to manipulate genetic structure.
def sin_prior(coords,target,vector2,angle,fst_max= 0.2,passport= False):
    """Displace the target population's centroid along vector2 sinusoidally.

    The displacement amplitude is (fst_max * 10 - 1) * sin(angle). Returns
    (coords, ID) when passport is True, otherwise coords alone.
    """
    ID= 'sinusoid'
    amplitude= fst_max * 10 - 1
    shift= [amplitude * sin(angle) * v for v in vector2]
    coords[target[0]] = coords[target[0]] - shift
    if passport:
        return coords,ID
    return coords
def linear_prior(coords,target,vector2,angle,region= [-5,5],slope= 1,passport= False):
    """Shift the target centroid linearly along vector2 while angle is in region.

    The shift grows from 0 at region[0] to slope at region[1]; outside the
    region coords are untouched. Returns (coords, ID) when passport is True.
    """
    ID= 'linear'
    lo, hi = region
    if lo <= angle <= hi:
        progression= abs(angle - lo) / (hi - lo)
        coords[target[0]] = coords[target[0]] + [progression * slope * v for v in vector2]
    if passport:
        return coords,ID
    return coords
def introgression_prior(coords,target,vector2,angle, region,passport= False):
    """Overwrite the target centroid with population 1's inside region.

    Models an introgression pulse: for angle within [region[0], region[1]] the
    target entry is rebound (by reference) to coords[1]; vector2 is unused but
    kept so all prior functions share one signature.
    """
    ID= 'introgression'
    inside= region[0] <= angle <= region[1]
    if inside:
        coords[target[0]] = coords[1]
    if passport:
        return coords,ID
    return coords
def alien_prior_I(coords,target,vector2,angle,fst= .2,region= [-5,5],passport= False):
    """Differentiate the target inside region; merge it outside.

    Inside [region[0], region[1]] the target centroid is pushed along vector2
    with amplitude (10 * fst - 1); otherwise it is collapsed (by reference)
    onto the population indexed by target[1].
    """
    ID= 'alien I'
    if region[0] <= angle <= region[1]:
        push= [(10 * fst - 1) * v for v in vector2]
        coords[target[0]] = coords[target[0]] + push
    else:
        coords[target[0]] = coords[target[1]]
    if passport:
        return coords,ID
    return coords
def alien_prior_II(coords,target,vector2,angle,fst= .2,region= [-5,5],passport= False):
    """Differentiate the target inside region only; leave it alone elsewhere.

    Inside [region[0], region[1]] the target centroid is pushed along vector2
    with amplitude (10 * fst - 1). Returns (coords, ID) when passport is True.
    """
    ID= 'alien II'
    inside= region[0] <= angle <= region[1]
    if inside:
        coords[target[0]] = coords[target[0]] + [(10 * fst - 1) * v for v in vector2]
    if passport:
        return coords,ID
    return coords
def alien_prior_III(coords,target,vector2,angle,fst_a= 0.2,fst_b= .2,region= [-5,5],passport= False):
    """Differentiate the target everywhere, with fst_b inside region and fst_a outside.

    The target centroid is pushed along vector2 with amplitude (10 * fst - 1),
    where fst is fst_b for angle inside [region[0], region[1]] and fst_a
    otherwise. Returns (coords, ID) when passport is True.
    """
    ID= 'alien III'
    if region[0] <= angle <= region[1]:
        factor= 10 * fst_b - 1
    else:
        factor= 10 * fst_a - 1
    coords[target[0]] = coords[target[0]] + [factor * v for v in vector2]
    if passport:
        return coords,ID
    return coords
A first look at the correlation between simulated frequency vectors across data sets.
from Lab_modules.Generate_samples import Gen_samples, Check_Path, plot_GenFst
### Sampling design: four populations of 100 haplotypes each.
Sizes= [100,100,100,100]
Npops= len(Sizes)
range_diff= [-10,10]   # range of the control 'angle' swept across windows
# Keyword arguments forwarded to the structure prior chosen below.
prior_kwargs= {
    'fst_a': .0,
    'fst_b': 0.3,
    'region': [-3,-1]
}
prior_func= alien_prior_III
# Preview the Fst trajectory this prior produces along the angle sweep.
fig, Pops, prior= Check_Path(Npops,vector_lib,prior_func,prior_kwargs,Pops= [],random= True,n_comp= L,range_diff= range_diff,steps= 100)
iplot(fig)
D:\GitHub\Genetic-data-analysis\Notebooks\Lab_modules\StructE_tools.py:419: RuntimeWarning: invalid value encountered in double_scalars
# (Pops,Sizes,vector_lib,return_pca= False,n_comp= 100,prior= 'sinusoid',range_diff= [-10,10],steps= 100)
# Generate per-window haplotype samples and windowed Fst under the prior.
SequenceStore, Fst_windows= Gen_samples(Pops,Sizes,vector_lib,prior_func,prior_kwargs,range_diff= range_diff,steps= 100)
...
D:\GitHub\Genetic-data-analysis\Notebooks\Lab_modules\StructE_tools.py:419: RuntimeWarning: invalid value encountered in double_scalars
Done.
We verify that the structure produced at each local window corresponds to the expected pattern.
from Lab_modules.Generate_samples import plot_GenFst
# Visual check: windowed Fst between population pairs should match the prior.
plot_GenFst(Fst_windows,Npops,1)
The goal is to study the isolation of a group of accessions relative to all reference samples across data sets.
The idea is simple:
Later we will study the distribution of distances to target centroid across data sets.
from IPython.display import clear_output
from Lab_modules.Distance_tools_VI import Distance_analysis
### Perform Distance and association analysis on the data sets generated
### Define reference and admixed associations:
### for the purpose of this analysis exploration will be driven by
### structure distance profiles. Since KDE will be used for exploration,
### a set of accessions can be analysed that do not contribute to
### distance profiles.
admx_N= 15   # number of accessions held out as an 'admixed' probe set
# Random indices of the held-out accessions; all others act as references.
admx_indx= np.random.choice(range(sum(Sizes)),admx_N,replace= False)
ref_indx= [x for x in range(sum(Sizes)) if x not in admx_indx]
# Population label of every haplotype, in sample order.
labels_pops= np.repeat(range(len(Sizes)),Sizes)
##
# Reference library: population id -> indices of its samples.
refs_lib= {x:[i for i in range(sum(Sizes)) if labels_pops[i] == x] for x in range(len(Sizes))}
# The admixed probe set is appended as one extra 'population' id.
admx_lib= {(max(refs_lib.keys()) + 1): admx_indx}
admx_lib.update(refs_lib)
Geneo= admx_lib
### Define parameters and libraries of analyses.
DIMr = 'PCA'           # per-window dimensionality-reduction method
Bandwidth_split= 30    # KDE bandwidth grid resolution
ncomp_local= 4         # PCA components per local window
clsize= 15             # minimum cluster size
target_group= 0        # focus population whose isolation is tracked
# Per-window distance analysis in local PCA feature space.
Distances, Clover, Ref_stats_lib, Ref_stats, center_distances, Coordinates, Clusters_coords= Distance_analysis(SequenceStore,target_group,refs_lib,DIMr = 'PCA',
                                                                    Bandwidth_split= Bandwidth_split,
                                                                    ncomp_local= ncomp_local,
                                                                    clsize= clsize)
# Cast everything to arrays so windows can be fancy-indexed below.
Ref_stats= np.array(Ref_stats)
Clover= np.array([x for x in Clover])
Coordinates= np.array(Coordinates)
Clusters_coords= np.array(Clusters_coords)
Distances= np.array(Distances)
center_distances= np.array([x for x in center_distances])
#center_distances= np.array([x.reshape(1,-1)[0] for x in center_distances])
#center_distances= np.array([x.reshape(1,-1)[0] for x in center_distances])
from sklearn import preprocessing
### initial step of filtering.
### removing windows where variation between references resulted
### in normalized min and maximum distances above threshold.
###
Trim_threshold= 20
# Keep only windows whose normalized distances all lie within +/- threshold.
trim_indexes= [x for x in range(Clover.shape[0]) if \
min(Clover[x]) >= -Trim_threshold and max(Clover[x]) <= Trim_threshold and \
min(center_distances[x]) >= -Trim_threshold and max(center_distances[x]) <= Trim_threshold]
# Subset every per-window array to the retained windows.
Ref_stats= np.array(Ref_stats[trim_indexes])
Distances= np.array(Distances[trim_indexes])
Clover= np.array(Clover[trim_indexes])
center_distances= np.array(center_distances[trim_indexes])
Clusters_coords= np.array(Clusters_coords[trim_indexes])
Reduce and cluster distance vectors. Estimate average distances across clusters produced.
The clustering algorithm here is Kmeans. The value of K is set manually. I suggest using the default and coming back here to reset according to the structure observed below.
Organise into data
library.
from Lab_modules.Distance_tools import reduc_clust_summ
# NOTE(review): Whose_parents is not defined anywhere in this file -- this
# line raises NameError as written, and variation_focus is never used below.
variation_focus= range(len(Whose_parents))
### PCA
pca = PCA(n_components=5, whiten=False).fit(Clover.T)
X_se = pca.transform(Clover.T)
# Loadings scaled by the square root of explained variance.
COMPS81 = pca.components_.T*np.sqrt(pca.explained_variance_)
###############################################################################
###############################################################################
###############################################################################
###### compare decompositions:
from sklearn.decomposition import PCA, FactorAnalysis, FastICA
dr_names= ['PCA','FA','FastICA']
# NOTE(review): dr_names lists FastICA but Decoms holds only two estimators.
Decoms= [PCA(n_components= 5),FactorAnalysis(n_components= 5)]
###
# Reduce, cluster (KMeans, k=2) and summarise the distance vectors.
data= reduc_clust_summ(Clover,Distances,center_distances,Ref_stats,Decoms,dr_names,kclusters= 2)
print(data.keys())
dict_keys(['PCA', 'FA', 'Ref_stats', 'Distances', 'centre_dists'])
### Cluster proportions
### Select feature reduction:
Dr= 'PCA'    # which decomposition's features/labels to use downstream
clusters_labels= data[Dr]['labels_l1']
def paint_bars(selected):
    """Bar chart of window-cluster membership proportions.

    selected == 0 paints every bar the same blue; selected > 0 highlights the
    (selected - 1)-th bar in red. Reads the data/Dr/clusters_labels globals.
    """
    clusters= data[Dr]['features']
    n_bars= len(list(set(clusters.iloc[:,0])))
    color_range= ['rgb(158,202,225)'] * n_bars
    if selected != 0:
        color_range[selected - 1]= 'rgb(204,0,0)'
    cluster_ids= sorted(list(set(clusters_labels)))
    total= float(len(clusters_labels))
    proportions= [round(len([x for x in clusters_labels if x == y]) / total,3) for y in cluster_ids]
    bar_names= [str(x + 1) for x in cluster_ids]
    trace= [go.Bar(
        x= bar_names,
        y= proportions,
        text= proportions,
        marker=dict(
            color= color_range,
            line=dict(
                color='rgb(8,48,107)',
                width=1.5),
        ),
        opacity= .6
    )]
    layout= go.Layout(
        title= 'cluster proportions',
        xaxis= dict(
            title= 'MS clusters'
        ),
        yaxis= dict(
            title= '%'
        )
    )
    fig= go.Figure(data=trace,layout=layout)
    iplot(fig)
#interact(paint_bars,selected= [x for x in range(len(list(set(clusters_labels)))+1)])
# Static call; highlight cluster 3.
paint_bars(3)
### plot clusters:
clusters_labels= data[Dr]['labels_l1']
clusters= data[Dr]['features']
def plot_clusters(selected):
    """Scatter of window clusters in (pc2, pc1) feature space.

    selected == 0 renders every cluster fully opaque; selected > 0 dims all
    clusters except the (selected - 1)-th. Reads clusters/clusters_labels.
    """
    cluster_ids= [x for x in list(set(clusters_labels))]
    n_clusters= len(list(set(clusters_labels)))
    if selected == 0:
        opac_range= [1] * n_clusters
    else:
        opac_range= [.3] * n_clusters
        opac_range[selected - 1]= 1
    fig_data= []
    for i in cluster_ids:
        members= [x for x in range(len(clusters_labels)) if clusters_labels[x] == i]
        fig_data.append(go.Scatter(
            y= clusters.iloc[members,:].pc1,
            x= clusters.iloc[members,:].pc2,
            mode= "markers",
            marker= {
                'line': {'width': 0},
                'size': 6,
                'symbol': 'circle',
                'opacity': opac_range[i]
            },
            name= str(i + 1)
        ))
    layout= go.Layout(
        title= 'Distance to references PCA',
        xaxis= dict(
            title= 'PC2'
        ),
        yaxis= dict(
            title= 'PC1'
        ),
        showlegend= True
    )
    fig= go.Figure(data=fig_data, layout=layout)
    iplot(fig)
#interact(plot_clusters,selected= [x for x in range(len(clusters.label.unique())+1)])
# Show all clusters at full opacity.
plot_clusters(0)
Loadings correspond to haplotype samples.
# Window loadings (one row per window) in the global PCA of distance vectors.
df= pd.DataFrame(X_se,columns= None)
Centre_distances= data['centre_dists']
Ref_stats= data['Ref_stats']
# Plot-title tag; 'prior' is the ID string returned by Check_Path above.
ID= prior + ' sim'
### plot loadings:
# Per-sample KDE summary used to colour the loadings plot.
vectors= data['PCA']['KDE']
def plot_accessions(selected_column):
    """Scatter accessions using columns 2 and 3 of df as coordinates.

    selected_column == 0: colour points by population label.
    selected_column > 0: colour by column (selected_column - 1) of the KDE
    summary in `vectors`. Reads the df/vectors/labels_pops globals.
    """
    layout= go.Layout(
        margin=dict(
            l=0,
            r=0,
            b=0,
            t=0
        ),
        xaxis= dict(
            title= 'PC1',
        ),
        showlegend= True
    )
    #names_index = [[f for f in orderCore.ID].index(x) for x in [str(y) for y in df.iloc[:,1]]]
    opac= .8
    if selected_column == 0:
        scheme = labels_pops
        # NOTE(review): the trailing 'and x' drops sample index 0 from its
        # group, and the comprehension below also skips population id 0 --
        # confirm both exclusions are intentional.
        coords = {y:[x for x in range(len(scheme)) if scheme[x] == y and x ] for y in list(set(scheme))}
        #pop_refs= ["Indica","cAus","Japonica","GAP","cBasmati","Admix"]
        #color_here= color_ref
        fig= [go.Scatter(
            x = df.iloc[coords[i],2],
            y = df.iloc[coords[i],3],
            mode= "markers",
            marker= {
            #    'color': scheme,
                'color': i,
                'line': {'width': 0},
                'size': 6,
                'symbol': 'circle',
                "opacity": 1
            },
            name= str(i)
        ) for i in coords.keys() if i != 0]
    if selected_column > 0:
        fig= [go.Scatter(
            x = df.iloc[:,2],
            y = df.iloc[:,3],
            mode= "markers",
            marker= {
                'color': vectors.iloc[:,selected_column - 1],
                'colorbar': go.ColorBar(
                    title= 'ColorBar'
                ),
                'colorscale': 'Viridis',
                'line': {'width': 0},
                'size': 6,
                'symbol': 'circle',
                "opacity": opac
            }
        )]
    fig = go.Figure(data=fig,layout= layout)
    iplot(fig)
#interact(plot_accessions,selected_column= [x for x in range(len(clusters.label.unique())+1)])
# Colour the loadings by population label.
plot_accessions(0)
We now look at the distance of individual samples to target and reference centroids across data sets. We partition distance vectors according to the clusters studied above.
How variable are average distance measurements across vectors? by Group (plot) and individual (scatter).
from Lab_modules.distance_subplots import dens_plot, trace_plot
# Grid layout for the per-cluster density subplots.
Ncols= 2
height= 400
width= 1000
data_groups= list(set(clusters_labels))
titles= ['CL: {}'.format(x) for x in data_groups]
dens_plot(data,data_groups,titles,Dr,Ncols= Ncols,width= width,height= height)
This is the format of your plot grid: [ (1,1) x1,y1 ] [ (1,2) x2,y2 ] 1 2
### Z= 0: raw
### Z= 1: scaled by reference mean and sd.
Z= 1
Ncols= 2
height= 400
width= 1000
data_groups= list(set(clusters_labels))
titles= ['CL: {}'.format(x) for x in data_groups]
# Per-accession mean-distance traces, partitioned by window cluster.
trace_plot(data,data_groups,labels_pops,titles,Z= Z,Ncols= Ncols,width= width,height= height)
This is the format of your plot grid: [ (1,1) x1,y1 ] [ (1,2) x2,y2 ] dict_keys([0, 1, 2, 3]) dict_keys([0, 1, 2, 3])
The plot above shows us the average distances to target and reference centroids for every accession counted. The x values should be significantly lower for accessions from the target group, assuming that these were chosen for high degree of correlation across data sets.
If other accessions also appear close to this group (presenting negative values if the Z
is set to 1 and values are scaled by windows), then we have evidence of genetic similarity at those windows covered by these measurements.
If the target group appears isolated across Kmeans groups, then we can say that it is simply differentiated from the remainder.
However, if some plots isolate target accessions while others do not, then we have evidence of genetic exchange. Either from other material present into a differentiated target source, or from a differentiated population into a common background still present among other references.
If the structure prior chosen was that of a jump in Fst for pop0, and that population was then chosen as target, then this is a very rough example of what we might achieve given an introgression of wild material into a genetically divergent domesticated genepool. Of rice for example.
Below we study distribution of genetic distances by group in more depth.
..to be made prettier..
### Distribution of feature space distances between control populations for even and biased scenarios
from sklearn.neighbors import KernelDensity
# Window-cluster ids (1-based) to include; range(11) keeps every cluster.
select_l1= range(11)
selected1= [x for x in range(Distances.shape[0]) if data[Dr]['labels_l1'][x] + 1 in select_l1]
# Reference mean/sd and target mean/variance per selected window.
Means= Ref_stats[selected1,0]
Varsity= Ref_stats[selected1,1]
target_mean= Ref_stats[selected1,2]
target_var= Ref_stats[selected1,3]
# Z-score of the target mean against the reference distribution, per window.
Normalized_mean = [(target_mean[x] - Means[x]) / Varsity[x] for x in range(len(selected1))]
Normalized_mean= np.nan_to_num(Normalized_mean)
# Clamp extreme outliers. NOTE(review): values beyond +/-20 are replaced by
# +/-10, which falls inside the retained range -- confirm this is intended.
Normalized_mean[Normalized_mean > 20] = 10
Normalized_mean[Normalized_mean < -20] = -10
# Per window: centroid distance of the sample closest to the target centroid.
max_vec= [Centre_distances[x][[y for y in Distances[x]].index(min(Distances[x]))] for x in selected1]
max_vec= [(max_vec[x] - Means[x]) / Varsity[x] for x in range(len(selected1))]
max_vec= np.nan_to_num(max_vec)
max_vec[max_vec > 40] = 10
max_vec[max_vec < -40] = -10
# Per window: largest distance to the target centroid, z-scored.
max_dist= [max(Distances[x]) for x in selected1]
max_dist= [(max_dist[x] - Means[x]) / Varsity[x] for x in range(len(selected1))]
max_dist= np.nan_to_num(max_dist)
max_dist[max_dist > 40] = 10
max_dist[max_dist < -40] = -10
###
###
# KDE of the reference means (blue curve).
X_plot = np.linspace(min(Means) - 2, max(Means) + 2, 1000)
kde = KernelDensity(kernel='gaussian', bandwidth=0.05).fit(np.array(Means).reshape(-1,1))
log_dens = kde.score_samples(X_plot.reshape(-1,1))
fig_roost_dens= [go.Scatter(x=X_plot, y=np.exp(log_dens),
                            mode='lines', fill='tozeroy', name= 'Ref mean',
                            line=dict(color='blue', width=2))]
## Change means and variances to those of selected clusters
##
# KDE of the normalized target means (red curve).
X_plot = np.linspace(min(Normalized_mean) - 2, max(Normalized_mean) + 2, 1000)
kde = KernelDensity(kernel='gaussian', bandwidth=0.05).fit(np.array(Normalized_mean).reshape(-1,1))
log_dens = kde.score_samples(X_plot.reshape(-1,1))
fig_roost_dens.append(go.Scatter(x=X_plot, y=np.exp(log_dens),
                                 mode='lines', fill='tozeroy', name= 'Target mean',
                                 line=dict(color='red', width=2)))
# 95% normal interval of the normalized target means (reused by return_refs).
Dists_mean= np.mean(Normalized_mean)
Dists_std= np.std(Normalized_mean)
from scipy.stats import norm
Dists_endpoints= norm.interval(0.95, loc=Dists_mean, scale=Dists_std)
print(Dists_endpoints)
layout= go.Layout(
    title= '{}, Ztarget and normal Dists.'.format(ID),
    xaxis= dict(
        range= [-5,20]
    )
)
fig = go.Figure(data=fig_roost_dens, layout= layout)
iplot(fig)
(-3.8587207904733902, 4.68910999936198)
##
# KDE of the reference standard deviations (blue curve).
X_plot = np.linspace(min(Varsity) - 2, max(Varsity) + 2, 1000)
kde = KernelDensity(kernel='gaussian', bandwidth=0.05).fit(np.array(Varsity).reshape(-1,1))
log_dens = kde.score_samples(X_plot.reshape(-1,1))
fig_vars_dens = [go.Scatter(x=X_plot, y=np.exp(log_dens),
                            mode='lines', fill='tozeroy', name= 'Ref SD',
                            line=dict(color='blue', width=2))]
##
# KDE of the target variances, overlaid in red.
X_plot = np.linspace(min(target_var) - 2, max(target_var) + 2, 1000)
kde = KernelDensity(kernel='gaussian', bandwidth=0.05).fit(np.array(target_var).reshape(-1,1))
log_dens = kde.score_samples(X_plot.reshape(-1,1))
fig_vars_dens.append(go.Scatter(x=X_plot, y=np.exp(log_dens),
                                mode='lines', fill='tozeroy', name= 'target SD',
                                line=dict(color='red', width=2)))
##
layout= go.Layout(
    title= '{}, Variation across target and reference clusters'.format(ID)
)
fig = go.Figure(data=fig_vars_dens, layout= layout)
iplot(fig)
# Flag windows whose closest-sample z-score exceeds the threshold.
threshold= 2
coords_threshold= [int(x > threshold) for x in max_vec]
coords_threshold= {z:[x for x in range(len(coords_threshold)) if coords_threshold[x] == z] for z in list(set(coords_threshold))}
prop_thresh= round(len(coords_threshold[1]) / float(len(Normalized_mean)) * 100,2)
# NOTE(review): clusters.pc1 is indexed with positions into selected1 --
# confirm the rows of clusters and the selected1 windows are aligned.
fig_data= [go.Scatter(
    x= [clusters.pc1[x] for x in coords_threshold[i]],
    #x= [max_vec[x] for x in coords_threshold[i]],
    #y= [max_dist[x] for x in coords_threshold[i]],
    y= [Normalized_mean[x] for x in coords_threshold[i]],
    mode= 'markers',
    name= '{} {}'.format(['<=','>'][i],threshold),
    marker= dict(
        color= ['grey','red'][i]
    )
) for i in coords_threshold.keys()
]
layout = go.Layout(
    title= '{}, {} % observations above threshold'.format(ID,prop_thresh),
    yaxis=dict(
        title='Distance',
        range= [-10,40]),
    xaxis=dict(
        title='PC1')
)
fig= go.Figure(data=fig_data, layout=layout)
iplot(fig)
# Notebook echo of the partition keys (expected: {0, 1}).
coords_threshold.keys()
dict_keys([0, 1])
from Lab_modules.Generate_freq_vectors import generate_Branches_Beta
from Lab_modules.Euc_to_fst import Euc_to_fst
# Regenerate the vector library, this time organised into branches
# (tree-related populations) instead of independent draws.
Nbranches= 4
L= 150
n= 100
rangeA= [1,2.5]
rangeB = [.1,.6]
steps= 20
n_comp = 100
density= 50
#vector_lib= generate_vectors_Beta(L,n,rangeA,rangeB,steps,n_comp)
#pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(vector_lib)
#features = pca.transform(vector_lib)
features, vector_lib= generate_Branches_Beta(4,50,L,n,rangeA,rangeB,steps,n_comp)
print(features.shape)
print(vector_lib.shape)
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) <ipython-input-83-4246c5f0c0d9> in <module>() 1 from Lab_modules.Generate_freq_vectors import generate_Branches_Beta ----> 2 from Lab_modules.Euc_to_fst import Euc_to_fst 3 4 Nbranches= 4 5 L= 150 D:\GitHub\Genetic-data-analysis\Notebooks\Lab_modules\Euc_to_fst.py in <module>() 1 import numpy as np 2 import collections ----> 3 import StructE_tools as Ste 4 from scipy.stats import beta 5 from sklearn.decomposition import PCA ModuleNotFoundError: No module named 'StructE_tools'
# Fit the log-linear map from PCA euclidean distances to Fst (used by return_refs).
m_coeff, b, fst_x, y_true= Euc_to_fst(vector_lib)
length haps: 150, N iterations: 20, range pops: 8
C:\Users\jgarcia\Desktop\Jupyter_stuff\Genetic-data-analysis\StructE_tools.py:398: RuntimeWarning: invalid value encountered in double_scalars
def return_refs(t1,t2,registered,Z):
    """Scatter of per-accession mean distances for a selected set of windows.

    t1/t2: z-score interval used to select windows (overridden by the 95%
    normal interval Dists_endpoints when registered == 'inlier').
    Z: 'raw' | 'scaled' | 'Fst' -- how distances are transformed before
    averaging. Reads many globals (Distances, Centre_distances, Ref_stats,
    Means, Varsity, target_mean, labels_pops, m_coeff, b, ID, iplot, ...).
    """
    # Z-score the target means against the references, window by window.
    Normalized_mean = [(target_mean[x] - Means[x]) / Varsity[x] for x in range(len(target_mean))]
    max_vec= [Centre_distances[x][[y for y in Distances[x]].index(min(Distances[x]))] for x in range(len(Means))]
    Normalized_vec= [(max_vec[x] - Means[x]) / Varsity[x] for x in range(len(Means))]
    if registered == 'inlier':
        # Replace the user bounds with the precomputed 95% normal interval.
        t1= Dists_endpoints[0]
        t2= Dists_endpoints[1]
        #Normalized_mean= Normalized_vec
    Normalized_mean= np.nan_to_num(Normalized_mean)
    Normalized_mean[Normalized_mean > 20] = 10
    Normalized_mean[Normalized_mean < -20] = -10
    # Partition windows: 1 = inside [t1, t2], 0 = outside.
    coords_threshold= [int(x >= t1 and x <= t2) for x in Normalized_mean]
    coords_threshold= {z:[x for x in range(len(coords_threshold)) if coords_threshold[x] == z] for z in list(set(coords_threshold))}
    #names_index = [[f for f in orderCore.ID].index(x) for x in [str(y) for y in df.iloc[:,1]]]
    average_selected= round(np.mean([Normalized_mean[x] for x in coords_threshold[1] if Normalized_vec[x] > - 20 and Normalized_vec[x] < 20]),2)
    prop_thresh= round(len(coords_threshold[1]) / float(len(Normalized_mean)) * 100,2)
    selected1= coords_threshold[1]
    ###
    scheme = labels_pops
    # NOTE(review): 'and x' drops sample index 0 from its group -- confirm.
    coords = {y:[x for x in range(len(scheme)) if scheme[x] == y and x ] for y in list(set(scheme))}
    print(coords.keys())
    pop_refs= ["Indica","cAus","Japonica","GAP","cBasmati","Admix"]
    #color_here= color_ref
    ###
    ref_names= ['Indica','cAus','Japonica','cBasmati','Admix']
    #scheme= [orderCore.loc[x,'sNMF_K3'] for x in names_index]
    #scheme= {z:[x for x in range(len(scheme)) if scheme[x] == z] for z in list(set(scheme))}
    if Z== 'raw':
        # Plain means of the selected windows.
        X= np.mean(Distances[selected1,:],axis= 0)
        print(len(scheme))
        Y= np.mean(Centre_distances[selected1,:],axis= 0)
        max_dists= [max(Centre_distances[x]) for x in selected1]
        min_centre= [min(Distances[x]) for x in selected1]
        Normalized_mean= np.mean(max_dists)
        Normalized_centre= np.mean(min_centre)
    if Z== 'scaled':
        # Z-score each selected window by its reference mean/sd first;
        # windows with near-zero reference sd are dropped.
        meansVars= Ref_stats[selected1,:]
        trim= [x for x in range(meansVars.shape[0]) if meansVars[x,1] > 0.05]
        meansVars= meansVars[trim,:]
        select_trim= [selected1[x] for x in trim]
        max_dists= [max(Centre_distances[x]) for x in select_trim]
        min_centre= [min(Distances[x]) for x in select_trim]
        average_selected= round(np.mean([Means[x] for x in select_trim if Normalized_vec[x] > - 20 and Normalized_vec[x] < 20]),2)
        sel_d= Distances[select_trim,:]
        sel_d= [(sel_d[x] - meansVars[x,0]) / meansVars[x,1] for x in range(len(select_trim))]
        X= np.mean(sel_d,axis=0)
        sel_refd=Centre_distances[select_trim,:]
        sel_refd= [(sel_refd[x] - meansVars[x,0]) / meansVars[x,1] for x in range(len(select_trim))]
        Y= np.mean(sel_refd,axis=0)
    if Z== 'Fst':
        # Map mean euclidean distances to Fst via the fitted log-linear model.
        X= np.mean(Distances[selected1,:],axis= 0)
        print(len(scheme))
        Y= np.mean(Centre_distances[selected1,:],axis= 0)
        X= [np.exp(m_coeff*np.log(x) + b) for x in X]
        Y= [np.exp(m_coeff*np.log(x) + b) for x in Y]
        max_dists= [max(Centre_distances[x]) for x in selected1]
        min_centre= [min(Distances[x]) for x in selected1]
    # One scatter trace per population.
    trace1 = [go.Scatter(
        visible = True,
        x= [X[z] for z in coords[i]],
        y= [Y[z] for z in coords[i]],
        mode= 'markers',
        name= str(i),
        marker=dict(color= i, size=7, opacity=0.6)
    ) for i in coords.keys()]# if coords[i] and i != 5]
    layout = go.Layout(
        title= '{} distances; % {}; Tz. > {} < {} local sd; sel: {}'.format(ID,prop_thresh,round(t1,2),round(t2,2),average_selected),
        showlegend=False,
        autosize=True,
        xaxis= dict(
            range= [-6,15],
            title= 'PCA euc distances'
        ),
        yaxis= dict(
            range= [-2,15],
            title= 'PCA distances to center')
    )
    fig= go.Figure(data=trace1,layout= layout)
    iplot(fig)
# Interactive widget driving the function above.
interact(return_refs,t1= range(-2,20),t2= range(2,20),Z= ['raw','scaled','Fst'],registered=['registered','inlier'])
Failed to display Jupyter Widget of type interactive
.
If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean that the widgets JavaScript is still loading. If this message persists, it likely means that the widgets JavaScript library is either not installed or not enabled. See the Jupyter Widgets Documentation for setup instructions.
If you're reading this message in another frontend (for example, a static rendering on GitHub or NBViewer), it may mean that your frontend doesn't currently support widgets.
<function __main__.return_refs>
test_guy= 12   # example window index used in the single-window demo further down
range_cdist= [np.percentile(Centre_distances,1),np.percentile(Centre_distances,99),400]
####
N= 400
P= 40   # grid resolution per axis for the 2D KDE background
Distances_mean= np.mean(Distances[selected1],axis= 0)
# NOTE(review): Focus_dist and Focus_center are not defined anywhere in this
# file -- the next two lines raise NameError as written.
range_distances= [np.percentile(Focus_dist,1),np.percentile(Focus_dist,99),400]
range_cdist= [np.percentile(Focus_center,1),np.percentile(Focus_center,99),400]
params = {'bandwidth': np.linspace(.05,.3,10)}
grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0)
distances_dens= []
Cdist_dens= []
# Regular (distance, centroid-distance) grid the KDEs are scored on.
i_coords, j_coords = np.meshgrid(np.linspace(range_distances[0],range_distances[1],P),
                                 np.linspace(range_cdist[0],range_cdist[1],P),indexing= 'ij')
traces= [x for x in it.product(range(P),range(P))]
params_unique = {'bandwidth': np.linspace(.1, .3,20)}
grid_unique = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params_unique,verbose=0)
params_dens= {'bandwidth': np.linspace(.4, .8,20)}
grid_dens = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params_dens,verbose=0)
traces= [x for x in it.product(range(P),range(P))]
# Flatten the grid into a (P*P, 2) array of scoring points.
background= np.array([i_coords, j_coords])
background= [background[:,c[0],c[1]] for c in traces]
background=np.array(background)
# Per-window 2D KDE over (distance-to-target, distance-to-references) pairs.
for karl in range(Distances.shape[0]):
    # The triple-quoted string below is disabled code kept from the notebook
    # (an alternative per-axis density measure); it is evaluated and discarded.
    """
    kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(np.array(Distances[karl,:])[:,np.newaxis])
    scores= kde.score_samples(np.linspace(*range_distances)[:,np.newaxis])
    distances_dens.append(scores)
    kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(np.array(Centre_distances[karl,:])[:,np.newaxis])
    scores= kde.score_samples(np.linspace(*range_cdist)[:,np.newaxis])
    Cdist_dens.append(scores)
    ### Density measure
    datum = np.array([[Distances[karl,x],Centre_distances[karl,x]] for x in range(Distances.shape[1])])
    grid_dens.fit(datum)
    kde = grid_dens.best_estimator_
    P_dist= kde.score_samples(datum)
    scores= kde.score_samples(background)
    scores= np.exp(scores)
    scores= np.array([x for x in scipy.stats.norm(np.mean(scores),np.std(scores)).cdf(scores)])
    """
    ### haplotypes measure
    # Fit on unique pairs so repeated haplotypes do not dominate the KDE;
    # fall back to the full set when fewer than 3 unique pairs exist.
    dotum= np.array([[Distances[karl,x],Centre_distances[karl,x]] for x in range(Distances.shape[1])])
    datum= np.unique(dotum,axis= 0)
    if len(datum) < 3:
        datum= dotum
    grid_unique.fit(datum)
    kde = grid_unique.best_estimator_
    P_dist= kde.score_samples(datum)
    scores_haps= kde.score_samples(background)
    scores_haps= np.exp(scores_haps)
    #scores= scores_haps
    scores= np.hstack((scores_haps))
    #
    Cdist_dens.append(scores)
#distances_dens= np.array(distances_dens)
Cdist_dens= np.array(Cdist_dens)
#coords= {i:[x for x in range(Distances.shape[0]) if data['labels_l1'][x] == i] for i in list(set(data['labels_l1']))}
### Cdist_dens must have been calculated.
N_view= 12   # number of windows stacked along the z axis
i_coords, j_coords = np.meshgrid(np.linspace(range_distances[0],range_distances[1],P),
                                 np.linspace(range_cdist[0],range_cdist[1],P),indexing= 'ij')
traces= [x for x in it.product(range(P),range(P))]
background= np.array([i_coords, j_coords])
background= [background[:,c[0],c[1]] for c in traces]
background=np.array(background)
# Stack the first N_view selected windows' KDE surfaces along z.
Xs= []
Ys= []
Zs= []
scores= []
for target in range(N_view):
    Xs.extend(background[:,0])
    Ys.extend(background[:,1])
    scores.extend(Cdist_dens[selected1[target],:P**2])
    Zs.extend([target] * background.shape[0])
# Mask grid points with negligible density before plotting.
thresho= .005
tie= [x for x in range(len((scores))) if scores[x] < thresho]
win= np.array([Xs,Ys,Zs,scores]).T
unmasked= win[[x for x in range(win.shape[0]) if x not in tie],:]
fig= [go.Scatter3d(
    x= unmasked[:,0],
    y= unmasked[:,1],
    z= unmasked[:,2],
    mode= 'markers',
    marker= {
        'color':unmasked[:,3],
        'colorbar': go.ColorBar(
            title= 'ColorBar'
        ),
        'colorscale':'Viridis',
        'line': {'width': 0},
        'size': 4,
        'symbol': 'circle',
        "opacity": .8
    }
)]
fig = go.Figure(data=fig)
iplot(fig)
### Single-window demo: density and haplotype KDE surfaces for one window.
# Bandwidth grids for the two KDEs. The lower bound must be strictly
# positive: sklearn's KernelDensity raises ValueError for bandwidth <= 0,
# which would abort the grid search (the original grid started at 0).
params_unique = {'bandwidth': np.linspace(.01, .2,20)}
grid_unique = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params_unique,verbose=0)
params_dens= {'bandwidth': np.linspace(.2, .6,20)}
grid_dens = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params_dens,verbose=0)
traces= [x for x in it.product(range(P),range(P))]
# Regular scoring grid flattened into a (P*P, 2) array.
i_coords, j_coords = np.meshgrid(np.linspace(range_distances[0],range_distances[1],P),
                                 np.linspace(range_cdist[0],range_cdist[1],P),indexing= 'ij')
background= np.array([i_coords, j_coords])
background= [background[:,c[0],c[1]] for c in traces]
background=np.array(background)
###
# (distance-to-target, distance-to-references) pairs for the chosen window.
datum= np.array([[Distances[test_guy,x],Centre_distances[test_guy,x]] for x in range(Distances.shape[1])])
dotum= np.array([[Distances[test_guy,x],Centre_distances[test_guy,x]] for x in range(Distances.shape[1])])
### Density measure
grid_dens.fit(datum)
kde = grid_dens.best_estimator_
P_dist= kde.score_samples(datum)
scores= kde.score_samples(background)
scores= np.exp(scores)
# Normal-CDF transform of the density scores.
scores= np.array([x for x in scipy.stats.norm(np.mean(scores),np.std(scores)).cdf(scores)])
### haplotypes measure: refit on unique pairs only.
datum= np.unique(dotum,axis= 0)
grid_unique.fit(datum)
kde = grid_unique.best_estimator_
P_dist= kde.score_samples(datum)
scores_haps= kde.score_samples(background)
scores_haps= np.exp(scores_haps)
scores_combine= scores_haps
fig= [go.Scatter3d(
    x= background[:,0],
    y= background[:,1],
#    z= scores[[x for x in range(len(scores)) if scores[x] > 0]],
    z= scores_combine,
    mode= 'markers',
    marker= {
        'color':scores_combine,
        'colorbar': go.ColorBar(
            title= 'ColorBar'
        ),
        'colorscale':'Viridis',
        'line': {'width': 0},
        'size': 4,
        'symbol': 'circle',
        "opacity": .8
    }
)]
fig = go.Figure(data=fig)
iplot(fig)