import scipy
import numpy as np
import pandas as pd
import itertools as it
from math import sin
import collections
def recursively_default_dict():
    """Return a dict that creates nested dicts on first access.

    Missing keys at any depth yield another auto-vivifying mapping, so
    assignments like d[a][b][c] = v need no prior setup.
    """
    return collections.defaultdict(lambda: recursively_default_dict())
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import scale
from scipy.stats.stats import pearsonr
from scipy.stats import invgamma
from scipy.stats import beta
import matplotlib.pyplot as plt
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
import plotly.figure_factory as ff
from sklearn.metrics import silhouette_samples, silhouette_score
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)
import Lab_modules.StructE_tools as Ste
from Lab_modules.Generate_freq_vectors import generate_vectors_Beta
### Parameters for building a library of allele-frequency vectors.
Nbranches= 4  # number of branches; not used in this cell — presumably for later runs
L= 150  # haplotype length (number of loci per frequency vector)
n= 100  # vectors drawn per parameter combination
rangeA= [1,2.5]  # range explored for the Beta 'a' parameter
rangeB = [.1,.6]  # range explored for the Beta 'b' parameter
steps= 20  # steps across each parameter range
n_comp = 100  # PCA components retained
density= 50  # unused here — TODO confirm intended use
# Build the frequency-vector library from Beta distributions.
vector_lib= generate_vectors_Beta(L,n,rangeA,rangeB,steps,n_comp)
# Project the library onto its principal components.
pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(vector_lib)
features = pca.transform(vector_lib)
#features, vector_lib= generate_Branches_Beta(4,50,L,n,rangeA,rangeB,steps,n_comp)
print(features.shape)
print(vector_lib.shape)
(2000, 100) (2000, 150)
### Functions to manipulate genetic structure.
def sin_prior(coords,target,vector2,angle,fst_max= 0.2,passport= False):
    """Shift the target population along vector2 by a sinusoid of `angle`.

    Mutates `coords` in place and returns it; when passport is True the
    pair (coords, prior-ID) is returned instead.
    """
    ID= 'sinusoid'
    amplitude = (fst_max * 10 - 1) * sin(angle)
    displacement = [amplitude * component for component in vector2]
    coords[target[0]] = coords[target[0]] - displacement
    if passport:
        return coords,ID
    return coords
def linear_prior(coords,target,vector2,angle,region= [-5,5],slope= 1,passport= False):
    """Push the target population along vector2 in proportion to how far
    `angle` has progressed through `region`; no-op outside the region.

    Mutates `coords` in place; returns (coords, prior-ID) when passport
    is True, otherwise just coords.
    """
    ID= 'linear'
    lo, hi = region
    if lo <= angle <= hi:
        progression = abs(angle - lo) / (hi - lo)
        step = [slope * progression * component for component in vector2]
        coords[target[0]] = coords[target[0]] + step
    return (coords, ID) if passport else coords
def introgression_prior(coords,target,vector2,angle, region,passport= False):
    """Replace the target population's coordinates with population 1's
    whenever `angle` falls inside `region` (an introgression event).

    Mutates `coords` in place; returns (coords, prior-ID) when passport
    is True.  Note: vector2 is accepted for signature parity but unused.
    """
    ID= 'introgression'
    inside = region[0] <= angle <= region[1]
    if inside:
        coords[target[0]] = coords[1]
    return (coords, ID) if passport else coords
def alien_prior_I(coords,target,vector2,angle,fst= .2,region= [-5,5],passport= False):
    """Inside `region`, offset the target population along vector2 with a
    magnitude set by `fst`; outside it, collapse the target onto the
    second target population.

    Mutates `coords` in place; returns (coords, prior-ID) when passport
    is True.
    """
    ID= 'alien I'
    if region[0] <= angle <= region[1]:
        offset = [(10 * fst - 1) * component for component in vector2]
        coords[target[0]] = coords[target[0]] + offset
    else:
        coords[target[0]] = coords[target[1]]
    return (coords, ID) if passport else coords
def alien_prior_II(coords,target,vector2,angle,fst= .2,region= [-5,5],passport= False):
    """Inside `region`, offset the target population along vector2 with a
    magnitude set by `fst`; outside it, leave coordinates untouched.

    Mutates `coords` in place; returns (coords, prior-ID) when passport
    is True.
    """
    ID= 'alien II'
    if region[0] <= angle <= region[1]:
        shift = [(10 * fst - 1) * component for component in vector2]
        coords[target[0]] = coords[target[0]] + shift
    return (coords, ID) if passport else coords
def alien_prior_III(coords,target,vector2,angle,fst_a= 0.2,fst_b= .2,region= [-5,5],passport= False):
    """Offset the target population along vector2 with one magnitude
    (`fst_b`) inside `region` and another (`fst_a`) outside it.

    Mutates `coords` in place; returns (coords, prior-ID) when passport
    is True.
    """
    ID= 'alien III'
    inside = region[0] <= angle <= region[1]
    fst_used = fst_b if inside else fst_a
    shift = [(10 * fst_used - 1) * component for component in vector2]
    coords[target[0]] = coords[target[0]] + shift
    return (coords, ID) if passport else coords
### Simulate populations along a structure-manipulation path.
from Generate_samples import Gen_samples, Check_Path, plot_GenFst
Sizes= [100,100,100,100]  # haplotypes sampled per population
Npops= len(Sizes)
range_diff= [-10,10]  # range of the control "angle" swept across windows
# keyword arguments forwarded to the chosen prior function
prior_kwargs= {
    'fst_a': .0,
    'fst_b': 0.3,
    'region': [-3,-1]
}
prior_func= alien_prior_III
# Preview the Fst trajectory this prior produces before generating data.
fig, Pops, prior= Check_Path(Npops,vector_lib,prior_func,prior_kwargs,Pops= [],random= True,n_comp= L,range_diff= range_diff,steps= 100)
iplot(fig)
C:\Users\jgarcia\Desktop\Jupyter_stuff\Genetic-data-analysis\StructE_tools.py:398: RuntimeWarning: invalid value encountered in double_scalars
# (Pops,Sizes,vector_lib,return_pca= False,n_comp= 100,prior= 'sinusoid',range_diff= [-10,10],steps= 100)
# Generate haplotype data per window plus per-window Fst under the chosen prior.
SequenceStore, Fst_windows= Gen_samples(Pops,Sizes,vector_lib,prior_func,prior_kwargs,range_diff= range_diff,steps= 100)
...
C:\Users\jgarcia\Desktop\Jupyter_stuff\Genetic-data-analysis\StructE_tools.py:398: RuntimeWarning: invalid value encountered in double_scalars
Done.
### Check that we got what we wanted.
# Plot the manipulated Fst across windows for chromosome 1.
plot_GenFst(Fst_windows,Npops,1)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-6-c592d4660d6b> in <module>() 1 ### Check that we got what we wanted. 2 ----> 3 plot_GenFst(Fst_windows,Npops,1) ~\Desktop\Jupyter_stuff\Genetic-data-analysis\Generate_samples.py in plot_GenFst(Fst_lib, Npops, Chr) 176 mode= 'markers', 177 name= '{}'.format([x for x in it.combinations(range(Npops),2)][i]) --> 178 ) for i in range(len([x for x in it.combinations(range(Npops),2)])) 179 ] 180 ~\Desktop\Jupyter_stuff\Genetic-data-analysis\Generate_samples.py in <listcomp>(.0) 176 mode= 'markers', 177 name= '{}'.format([x for x in it.combinations(range(Npops),2)][i]) --> 178 ) for i in range(len([x for x in it.combinations(range(Npops),2)])) 179 ] 180 NameError: name 'Fst_windows' is not defined
### Perform Distance and association analysis on the data sets generated
### Define reference and admixed associations:
### for the purpose of this analysis exploration will be driven by
### structure distance profiles. Since KDE will be used for exploration,
### a set of accessions can be analysed that do not contribute to
### distance profiles.
admx_N= 50  # number of accessions held out as "admixed"
admx_indx= np.random.choice(range(sum(Sizes)),admx_N,replace= False)
ref_indx= [x for x in range(sum(Sizes)) if x not in admx_indx]
# population label of every haplotype, in storage order
labels_pops= np.repeat(range(len(Sizes)),Sizes)
##
# reference library: population label -> indices of its accessions
refs_lib= {x:[i for i in range(sum(Sizes)) if labels_pops[i] == x] for x in range(len(Sizes))}
# the held-out admixed set gets the next free population label
admx_lib= {(max(refs_lib.keys()) + 1): admx_indx}
admx_lib.update(refs_lib)
Geneo= admx_lib
### Define parameters and libraries of analyses.
DIMr = 'PCA'  # dimensionality-reduction method: 'PCA' or 'NMF'
Bandwidth_split= 30  # grid resolution of the KDE bandwidth search
ncomp_local= 4  # components kept per window
clsize= 15  # minimum MeanShift cluster size retained
# per-window result containers (auto-vivifying nested dicts)
Results = {x:recursively_default_dict() for x in SequenceStore.keys()}
Clover= []
Coordinates= []
Clusters_coords= []
Dist_vectors= []
Struct_dist_vectors= recursively_default_dict()
Construct = recursively_default_dict()
Distances_vectors= recursively_default_dict()
PC_var= recursively_default_dict()
center_distances= []
Ref_stats= []
# Per-window analysis: reduce dimensionality, cluster with MeanShift,
# score per-cluster KDE likelihoods, then compute AMOVA / ANOVA-style stats.
for CHR in SequenceStore.keys():
    print('going on CHR: '+ str(CHR))
    for c in SequenceStore[CHR].keys():
        print('data set: {}'.format(c))
        ### PCA and MeanShift of information from each window copied from *FM36_Galaxy.py.
        Sequences = SequenceStore[CHR][c]
        Sequences= np.nan_to_num(Sequences)
        print(Sequences.shape)
        #### Dimensionality reduction
        if DIMr == 'PCA':
            pca = PCA(n_components=ncomp_local, whiten=False,svd_solver='randomized').fit(Sequences)
            data = pca.transform(Sequences)
            PC_var[CHR][c]= [x for x in pca.explained_variance_]
        if DIMr == 'NMF':
            from sklearn.decomposition import NMF
            data = NMF(n_components=ncomp_local, init='random', random_state=0).fit_transform(Sequences)
        Accurate = []
        # KDE bandwidth grid spans the range of the reduced coordinates.
        params = {'bandwidth': np.linspace(np.min(data), np.max(data),Bandwidth_split)}
        grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0)
        ######################################
        ####### TEST global Likelihood #######
        ######################################
        # Only reference accessions drive the clustering.
        Focus_labels = [z for z in it.chain(*refs_lib.values())]
        #### Mean Shift approach
        ## from sklearn.cluster import MeanShift, estimate_bandwidth
        bandwidth = estimate_bandwidth(data, quantile=0.2, n_samples=len(Focus_labels))
        if bandwidth <= 1e-3:
            # guard against a degenerate bandwidth estimate
            bandwidth = 0.1
        ms = MeanShift(bandwidth=bandwidth, cluster_all=False, min_bin_freq=15)
        ms.fit(data[Focus_labels,:])
        labels = ms.labels_
        # cluster label -> original accession indices (noise label -1 dropped)
        Tree = {x:[Focus_labels[y] for y in range(len(labels)) if labels[y] == x] for x in [g for g in list(set(labels)) if g != -1]}
        Keep= [x for x in Tree.keys() if len(Tree[x]) > clsize]
        Tree= {x:Tree[x] for x in Keep}
        print('N clusters: {}, Sizes: {}'.format(len(Tree),[len(x) for x in Tree.values()]))
        SpaceX = {x:data[Tree[x],:] for x in Tree.keys()}
        ### Extract MScluster likelihood by sample
        for hill in SpaceX.keys():
            grid.fit(data[Tree[hill],:])
            # use the best estimator to compute the kernel density estimate
            kde = grid.best_estimator_
            P_dist = kde.score_samples(data[Tree[hill],:])
            Dist = kde.score_samples(data)
            P_dist= np.nan_to_num(P_dist)
            Dist= np.nan_to_num(Dist)
            if np.std(P_dist) == 0:
                # degenerate cluster: fall back to a membership indicator
                Dist= np.array([int(Dist[x] in P_dist) for x in range(len(Dist))])
            else:
                # normalise each sample's log-likelihood against the
                # cluster's own log-likelihood distribution
                Dist = scipy.stats.norm(np.mean(P_dist),np.std(P_dist)).cdf(Dist)
            Dist= np.nan_to_num(Dist)
            Construct[CHR][c][hill] = Dist
        #########################################
        ############# AMOVA ################
        #########################################
        # keep only accessions assigned to a retained cluster
        Who = [x for x in range(len(labels)) if labels[x] != -1 and labels[x] in Keep]
        labels = [labels[x] for x in Who]
        Who= [Focus_labels[x] for x in Who]
        AMOVA,Cig = Ste.findPhiPT(Sequences[Who,:],labels,n_boot=0)
        ########################################################################
        ############## Distances ###############################################
        ########################################################################
        Dist_vectors= Ste.Distance_profiles(data,Tree,1000,Bandwidth_split)
        if Dist_vectors:
            Distances_vectors[CHR][c]= Dist_vectors
        Structures= Ste.Structure_profiles(data,Tree,1000,Bandwidth_split)
        if Structures:
            Struct_dist_vectors[CHR][c]= Structures
        ################ Other stats
        SIL = silhouette_score(data[Who,:],labels)
        # one-way ANOVA-style decomposition of distances to centroids
        df_between = len(Tree) - 1
        df_within = sum([len(Tree[x]) for x in Tree.keys()]) - len(Tree)
        df_total = sum([len(Tree[x]) for x in Tree.keys()]) - 1
        Total = data[Who,:]
        mu = np.mean(Total)
        sd = np.std(Total)
        Total = (Total - mu) / sd
        T = np.sqrt(((Total - Total.mean(axis=0))**2).sum(axis=1)).sum()**2 / len(Total)
        Y = (np.sqrt(((Total - Total.mean(axis=0))**2).sum(axis = 1))**2).sum()
        SS_between = -T
        SS_within = Y
        t_s = np.empty([len(Total),0])
        for gp in Tree.keys():
            Gp = Total[[x for x in range(Total.shape[0]) if labels[x] == gp],:]
            Gp = (Gp - mu) / sd
            SS_between += (np.sqrt(((Gp - Total.mean(axis=0))**2).sum(axis = 1)).sum()**2 / len(Gp))
            SS_within -= (np.sqrt(((Gp - Total.mean(axis=0))**2).sum(axis = 1)**2).sum() / len(Gp))
            t_s = np.append(t_s,np.sqrt(((Gp - Gp.mean(axis=0))**2)).sum(axis = 1))
        SS_total = Y - T
        MS_between = SS_between / df_between
        MS_within = SS_within / df_within
        # paired t-test: per-cluster centroid distances vs global centroid distances
        sqrt_center_dists= np.sqrt(((Total - Total.mean(axis=0))**2)).sum(axis = 1)
        t_stat,p_val = scipy.stats.ttest_rel(t_s,sqrt_center_dists)
        F = MS_between / float(MS_within)
        N_PC= len(Tree)
        eta_sqrd = SS_between / SS_total
        om_sqrd = (SS_between - (df_between * MS_within))/(SS_total + MS_within)
        Results[CHR][c] = [N_PC,F,om_sqrd,MS_between,t_stat,AMOVA,SIL,len(Tree),]
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-2-73ed7514efd0> in <module>() 9 10 admx_N= 50 ---> 11 admx_indx= np.random.choice(range(sum(Sizes)),admx_N,replace= False) 12 13 ref_indx= [x for x in range(sum(Sizes)) if x not in admx_indx] NameError: name 'Sizes' is not defined
# Gather the manipulated Fst series for every population pair.
Fst_pairs= [i for i in it.combinations(range(len(Sizes)),2)]
Fst_lib= []
for pair in range(len(Fst_pairs)):
    Fst_crawl= [[Fst_windows[Chr][n].fst[pair] for n in Fst_windows[Chr].keys()] for Chr in Fst_windows.keys()]
    Fst_crawl= [z for z in it.chain(*Fst_crawl)]
    Fst_lib.append(Fst_crawl)
# Flatten the per-window statistics into one array.
Other_stats= [[[Chr,wind,*Results[Chr][wind]] for wind in Results[Chr].keys()] for Chr in Results.keys()]
Other_stats= np.array([y for y in it.chain(*Other_stats)])
print(Other_stats.shape)
# columns [CHR,window,N_PC,F,om_sqrd,MS_between,t_stat,AMOVA,SIL]
# Scatter AMOVA (column 7) against the manipulated Fst of each pair.
fig_data= [go.Scatter(
    x= Fst_lib[i],
    y= Other_stats[:,7],
    mode= 'markers',
    name= str(Fst_pairs[i])
) for i in range(len(Fst_pairs))]
layout = go.Layout(
    title= 'Stats',
    yaxis=dict(
        title='AMOVA'),
    xaxis=dict(
        title='Manipulated Fst')
)
fig= go.Figure(data=fig_data, layout=layout)
iplot(fig)
(40, 9)
# Scatter AMOVA (column 7) against silhouette score (column 8) per window.
fig_data= [go.Scatter(
    x= Other_stats[:,8],
    y= Other_stats[:,7],
    mode= 'markers',
    name= 'Results'
)]
layout = go.Layout(
    title= 'Stats',
    yaxis=dict(
        title='AMOVA'),
    xaxis=dict(
        title='SIL')
)
fig= go.Figure(data=fig_data, layout=layout)
iplot(fig)
#####
# Stack the KDE likelihood vectors (one per window x cluster) into a matrix.
KDE_stats= [[[Construct[Chr][wind][gp] for gp in sorted(Construct[Chr][wind].keys())] for wind in sorted(Construct[Chr].keys())] for Chr in Construct.keys()]
KDE_stats= np.array([y for y in it.chain(*[z for z in it.chain(*KDE_stats)])])
KDE_stats= np.array(KDE_stats)
n_comp = 4
# Reduce the likelihood profiles and report variance explained.
pca1 = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')
likelihood_feats = pca1.fit_transform(KDE_stats)
var_comps= pca1.explained_variance_ratio_
print("; ".join(['PC{0}: {1}'.format(x+1,round(pca1.explained_variance_ratio_[x],3)) for x in range(n_comp)]))
print('features shape: {}'.format(likelihood_feats.shape))
## perform MeanShift clustering.
iu1 = np.triu_indices(KDE_stats.shape[0])
bandwidth = estimate_bandwidth(likelihood_feats, quantile=0.20)
params = {'bandwidth': np.linspace(np.min(likelihood_feats), np.max(likelihood_feats),30)}
grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=False, min_bin_freq=15)
ms.fit(likelihood_feats)
labels1 = ms.labels_
# cluster label -> row indices (noise label -1 excluded)
label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1))) if y != -1}
MS_centroids= [np.mean(likelihood_feats[label_select[z],:],axis= 0) for z in label_select.keys()]
MS_pair_dist= pairwise_distances(MS_centroids,metric= 'euclidean')
#MS_pair_dist= MS_pair_dist[iu_control]
# 3D scatter of the first three PCs, coloured by MeanShift cluster.
fig_data= [go.Scatter3d(
    x = likelihood_feats[label_select[i],0],
    y = likelihood_feats[label_select[i],1],
    z = likelihood_feats[label_select[i],2],
    type='scatter3d',
    mode= "markers",
    name= 'Cl: {}'.format(i),
    text= ['gp: {}'.format(i) for x in label_select[i]],
    marker= {
        'line': {'width': 0},
        'size': 4,
        'symbol': 'circle',
        "opacity": 1
    }
) for i in label_select.keys()]
layout = go.Layout(
    title= 'PCA red + MS clustering KDE profiles across windows',
    scene= Scene(
        yaxis=dict(
            title='{}'.format(round(var_comps[1],3))),
        xaxis=dict(
            title= '{}'.format(round(var_comps[0],3))),
        zaxis=dict(
            title= '{}'.format(round(var_comps[2],3))))
)
fig = go.Figure(data=fig_data,layout= layout)
iplot(fig)
PC1: 0.3; PC2: 0.24; PC3: 0.017; PC4: 0.013 features shape: (110, 4)
### Converting distances to fst's.
### Select pre and post processing measures.
Eigen = False  # weight PCs by explained variance ratio
Scale= False  # scale genotype matrices before PCA
Center= True
MixL= False  # select whether to mix haplotype lengths or not.
Length_increment= False  # step haplotype length up across repeats
L_step= 5
MixP= True  # select whether to vary the number of populations or not.
Pairs= False # select if comparing Pairs of distances or the distances themselves
Control_inc= True  # include the fixed control pair in every PCA fit
Predict= False  # predict Fst from distances (requires m_coeff, b from a prior fit)
length_haps= vector_lib.shape[1]
length_range= [75,vector_lib.shape[1]]
length_step= 10
pop_max= 8 # Number of pops
n_comp= 10 # components to keep following PCA
Iter= 20 # repeats
N_sims= 100 # number of haplotypes to generate from each pop in the unbiased scenario.
#### Predict
predicted= []
#def controled_fsts(vector_lib,Eigen,length_haps,Scale,Center,N_pops,n_comp,Iter,N_sims,MixL,MixP,Pairs):
lengths_vector= []
### Control set to include in the transformation:
control_vecs= np.random.choice(range(vector_lib.shape[0]),2)
control_labels= np.repeat([0,1],N_sims)
### Control set distances
control_even_distances= []
control_bias_distances= []
### store distances between centroids
biased_pairwise= []
unbiased_pairwise= []
corrected_pairwise= []
### store PC projection:
dist_PC_even= {x:[] for x in range(n_comp)}
dist_PC_bias= {x:[] for x in range(n_comp)}
dist_PC_corrected= {x:[] for x in range(n_comp)}
### store incremental PC distances
dist_increment_even= {x:[] for x in range(1,n_comp)}
dist_increment_bias= {x:[] for x in range(1,n_comp)}
### store fsts
fst_store= []
### proceed.
# Main experiment loop: per repeat, draw source populations, simulate
# haplotypes under even and biased sampling schemes, and compare PCA
# centroid distances against the true Fst between source vectors.
for rep in range(Iter):
    if MixP:
        # vary the number of populations per repeat
        N_pops= np.random.choice(range(3,pop_max),1,replace= False)[0]
    else:
        N_pops= pop_max
    if MixL:
        length_haps= np.random.choice(length_range,1)[0]
    if Length_increment:
        length_haps= int(length_range[0] + L_step * np.floor(rep / L_step))
    ## Population Sizes and labels
    bias_scheme= np.random.choice(range(25,200),N_pops,replace= False)  # uneven sizes
    unbiased_sheme= np.repeat(N_sims,N_pops)  # even sizes
    bias_labels= np.repeat(np.array([x for x in range(N_pops)]),bias_scheme)
    unbias_labels= np.repeat(np.array([x for x in range(N_pops)]),unbiased_sheme)
    ### triangular matrices extract.
    iu1= np.triu_indices(N_pops,1) # for centroid comparison
    iu_unbias= np.triu_indices(sum(unbiased_sheme),1)
    iu_bias= np.triu_indices(sum(bias_scheme),1)
    iu_control= np.triu_indices(2,1)
    Pops= np.random.choice(vector_lib.shape[0],N_pops,replace= False)
    print('Iter: {}, vectors selected: {}, hap length: {}'.format(rep,Pops,length_haps))
    ########## FST
    freqs_selected= vector_lib[Pops,:length_haps]
    Pairwise= Ste.return_fsts2(freqs_selected)
    #fsts_compare = scale(Pairwise.fst)
    fsts_compare= Pairwise.fst
    if Pairs:
        # compare ratios of Fst pairs instead of the Fsts themselves
        t= fsts_compare
        fsts_compare= [min([t[y] for y in z]) / max([t[y] for y in z]) for z in it.combinations(range(len(t)),2)]
    fst_store.extend(fsts_compare)
    ## lengths
    lengths_vector.extend([length_haps] * len(fsts_compare))
    #########################################################
    ########### PCA ####################################
    #########################################################
    ### control sample
    control_data= []
    for k in range(2):
        probs= vector_lib[control_vecs[k],:length_haps]
        # clip frequencies into [0, 1]; NOTE(review): this slice is a view,
        # so clipping writes back into vector_lib rows — confirm intended.
        probs[probs < 0] = 0
        probs[probs > 1] = 1
        m= unbiased_sheme[k]
        # draw binary haplotypes from the clipped frequencies
        Haps= [[np.random.choice([1,0],p= [1-probs[x],probs[x]]) for x in range(length_haps)] for acc in range(m)]
        control_data.extend(Haps)
    control_data= np.array(control_data)
    #### generate data and perform PCA.
    data= []
    for k in range(N_pops):
        probs= vector_lib[Pops[k],:length_haps]
        probs[probs < 0] = 0
        probs[probs > 1] = 1
        m= unbiased_sheme[k]
        Haps= [[np.random.choice([1,0],p= [1-probs[x],probs[x]]) for x in range(length_haps)] for acc in range(m)]
        data.extend(Haps)
    data1= np.array(data)
    if Scale:
        data1= scale(data1)
    if Control_inc:
        # fit PCA on controls + data so control distances are comparable
        pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(np.vstack([control_data,data1]))
        control_unbias_feat= pca.transform(control_data)
    else:
        pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(data1)
    feat_unbias= pca.transform(data1)
    if Eigen:
        feat_unbias= feat_unbias * pca.explained_variance_ratio_
    ####### centroid comparison
    #### Controls
    if Control_inc:
        control_centroids= [np.mean(control_unbias_feat[[y for y in range(control_unbias_feat.shape[0]) if control_labels[y] == z],:],axis= 0) for z in range(2)]
        control_centroids= np.array(control_centroids)
        unbias_control_dist= pairwise_distances(control_centroids,metric= 'euclidean')
        unbias_control_dist= unbias_control_dist[iu_control]
        control_even_distances.extend(unbias_control_dist)
    ####
    unbias_centroids= [np.mean(feat_unbias[[y for y in range(feat_unbias.shape[0]) if unbias_labels[y] == z],:],axis= 0) for z in range(N_pops)]
    unbias_centroids= np.array(unbias_centroids)
    unbias_pair_dist= pairwise_distances(unbias_centroids,metric= 'euclidean')
    unbias_pair_dist= unbias_pair_dist[iu1]
    if Predict:
        # map centroid distances to Fst via the fitted log-log regression
        fst_pred= [np.exp(m_coeff*np.log(x) + b) for x in unbias_pair_dist]
        predicted.extend(fst_pred)
        #print(np.array([fst_pred,fsts_compare]).T)
    #unbias_pair_dist= scale(unbias_pair_dist)
    unbiased_pairwise.extend(unbias_pair_dist)
    #################################################
    ############## biased sample
    #### generate data and perform PCA
    data= []
    for k in range(N_pops):
        probs= vector_lib[Pops[k],:]
        m= bias_scheme[k]
        Haps= [[np.random.choice([1,0],p= [1-probs[x],probs[x]]) for x in range(length_haps)] for acc in range(m)]
        data.extend(Haps)
    data2= np.array(data)
    if Scale:
        data2= scale(data2)
    if Control_inc:
        pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(np.vstack([control_data,data2]))
        control_bias_feat= pca.transform(control_data)
    else:
        pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(data2)
    feat_bias= pca.transform(data2)
    if Eigen:
        feat_bias= feat_bias * pca.explained_variance_ratio_
    #### Centroid distances
    #### Controls
    if Control_inc:
        control_centroids= [np.mean(control_bias_feat[[y for y in range(control_bias_feat.shape[0]) if control_labels[y] == z],:],axis= 0) for z in range(2)]
        control_centroids= np.array(control_centroids)
        bias_control_dist= pairwise_distances(control_centroids,metric= 'euclidean')
        bias_control_dist= bias_control_dist[iu_control]
        control_bias_distances.extend(bias_control_dist)
    bias_centroids= [np.mean(feat_bias[[y for y in range(feat_bias.shape[0]) if bias_labels[y] == z],:],axis= 0) for z in range(N_pops)]
    bias_centroids= np.array(bias_centroids)
    bias_pair_dist= pairwise_distances(bias_centroids,metric= 'euclidean')
    bias_pair_dist= bias_pair_dist[iu1]
    #bias_pair_dist= scale(bias_pair_dist)
    if Pairs:
        t= bias_pair_dist
        bias_pair_dist= [min([t[y] for y in z]) / max([t[y] for y in z]) for z in it.combinations(range(len(t)),2)]
    biased_pairwise.extend(bias_pair_dist)
    t= np.array([
        fsts_compare,
        unbias_pair_dist,
        bias_pair_dist
    ]).T
# Fit a log-log regression of Fst on biased centroid distance, restricted
# to one haplotype length and a trusted Fst window; m_coeff and b are
# used elsewhere to convert distances to Fst.
Size= length_haps
fst_lm_range= [.02,.3]
Lindexes= [x for x in range(len(lengths_vector)) if lengths_vector[x] == Size and fst_store[x] >= fst_lm_range[0] and fst_store[x] <= fst_lm_range[1]]
y_true= [np.log(biased_pairwise[x]) for x in Lindexes]
m_coeff,b= np.polyfit(y_true,[np.log(fst_store[x]) for x in Lindexes],1)
Iter: 0, vectors selected: [ 80 72 108 94], hap length: 150
C:\Users\jgarcia\Desktop\Jupyter_stuff\Distance_stats\StructE_tools.py:398: RuntimeWarning: invalid value encountered in double_scalars
Iter: 1, vectors selected: [159 138 146 6 158 118], hap length: 150 Iter: 2, vectors selected: [132 134 42 72], hap length: 150 Iter: 3, vectors selected: [ 26 72 181 93 177 165], hap length: 150 Iter: 4, vectors selected: [ 71 39 120], hap length: 150 Iter: 5, vectors selected: [ 22 116 123 184 111], hap length: 150 Iter: 6, vectors selected: [ 84 90 172 78 137], hap length: 150 Iter: 7, vectors selected: [ 46 118 23 19 27 127], hap length: 150 Iter: 8, vectors selected: [194 143 170 13 17 0 14], hap length: 150 Iter: 9, vectors selected: [168 120 177 8], hap length: 150 Iter: 10, vectors selected: [189 14 73 31 85 123 33], hap length: 150 Iter: 11, vectors selected: [ 88 123 83 129 142], hap length: 150 Iter: 12, vectors selected: [ 57 74 171 39 93], hap length: 150 Iter: 13, vectors selected: [ 45 94 167 107 199 142 143], hap length: 150 Iter: 14, vectors selected: [ 94 77 104], hap length: 150 Iter: 15, vectors selected: [163 19 172], hap length: 150 Iter: 16, vectors selected: [178 164 76 81], hap length: 150 Iter: 17, vectors selected: [133 15 73], hap length: 150 Iter: 18, vectors selected: [130 65 101 105], hap length: 150 Iter: 19, vectors selected: [ 32 74 85 140 93], hap length: 150
# Keep only structure profiles computed over this many clusters (vertices).
Structure_vertice= [2]
# Record (chromosome, window, cluster-combination) for every retained profile.
Combis= []
for chromo in Struct_dist_vectors.keys():
    for winds in Struct_dist_vectors[chromo].keys():
        for grp in Struct_dist_vectors[chromo][winds].keys():
            if len(grp) in Structure_vertice:
                Combis.append([chromo,winds,grp])
#Struct_distance_stats= np.array([y for y in it.chain(*Struct_distance_stats)])
# Stack the retained structure-distance profiles into one matrix
# (row order matches Combis).
Struct_distance_stats= [[[Struct_dist_vectors[Chr][wind][gp] for gp in sorted(Struct_dist_vectors[Chr][wind].keys()) if len(gp) in Structure_vertice] for wind in sorted(Struct_dist_vectors[Chr].keys())] for Chr in Struct_dist_vectors.keys()]
Struct_distance_stats= np.array([y for y in it.chain(*[z for z in it.chain(*Struct_distance_stats)])])
Struct_distance_stats= np.array(Struct_distance_stats)
n_comp = 4
# PCA on the structure-distance profiles.
pcaS = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')
Struct_dist_feats = pcaS.fit_transform(Struct_distance_stats)
var_compsQ= pcaS.explained_variance_ratio_
print("; ".join(['PC{0}: {1}'.format(x+1,round(pcaS.explained_variance_ratio_[x],3)) for x in range(n_comp)]))
print('features shape: {}'.format(Struct_dist_feats.shape))
# MeanShift clustering of the reduced profiles.
bandwidth = estimate_bandwidth(Struct_dist_feats, quantile=0.2)
params = {'bandwidth': np.linspace(np.min(Struct_dist_feats), np.max(Struct_dist_feats),30)}
grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=False, min_bin_freq=25)
ms.fit(Struct_dist_feats)
labels1 = ms.labels_
#labels1= [int(1 in x[2]) for x in Combis]
#labels1= [len(x[2]) for x in Combis]
#print(Combis[:10])
# cluster label -> row indices (noise label -1 excluded)
label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1))) if y != -1}
#### Let's plot the first 3 coordinates nonetheless.
####
# 3D scatter of the first three PCs of the structure-distance profiles,
# coloured by MeanShift cluster.
fig_data= [go.Scatter3d(
    x = Struct_dist_feats[label_select[i],0],
    y = Struct_dist_feats[label_select[i],1],
    z = Struct_dist_feats[label_select[i],2],
    type='scatter3d',
    mode= "markers",
    name= 'Cl: {}'.format(i),
    text= ['gp: {}'.format(i) for x in label_select[i]],
    marker= {
        'line': {'width': 0},
        'size': 4,
        'symbol': 'circle',
        "opacity": 1
    }
) for i in label_select.keys()]
layout = go.Layout(
    title= 'PCA red + MS clustering PCA dist profiles across windows',
    scene= Scene(
        yaxis=dict(
            title='{}'.format(round(var_compsQ[1],3))),
        xaxis=dict(
            title= '{}'.format(round(var_compsQ[0],3))),
        zaxis=dict(
            title= '{}'.format(round(var_compsQ[2],3))))
)
fig = go.Figure(data=fig_data, layout=layout)
iplot(fig)
PC1: 0.894; PC2: 0.06; PC3: 0.014; PC4: 0.011 features shape: (100, 4)
def plot_density(selected_group):
    """Plot the mean structure-distance profile of one MeanShift group,
    with the distance axis converted to Fst via the fitted log-log model.
    """
    profile_rows= Struct_distance_stats[label_select[selected_group],:]
    print(profile_rows.shape)
    mean_profile= np.mean(profile_rows,axis=0)
    # map the distance grid onto the Fst scale: fst = exp(m*log(d) + b)
    fst_axis= [np.exp(m_coeff * np.log(d) + b) for d in np.linspace(0,8,profile_rows.shape[1])]
    trace= go.Scatter(
        x= fst_axis,
        y= mean_profile,
        mode= 'markers',
        name= 'Results'
    )
    fig_layout= go.Layout(
        title= 'Stats',
        yaxis=dict(
            title='density'),
        xaxis=dict(
            title= 'Fst')
    )
    iplot(go.Figure(data=[trace], layout=fig_layout))
interact(plot_density,selected_group= label_select.keys())
Failed to display Jupyter Widget of type interactive
.
If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean that the widgets JavaScript is still loading. If this message persists, it likely means that the widgets JavaScript library is either not installed or not enabled. See the Jupyter Widgets Documentation for setup instructions.
If you're reading this message in another frontend (for example, a static rendering on GitHub or NBViewer), it may mean that your frontend doesn't currently support widgets.
<function __main__.plot_density>
# Transpose so accessions become the observations (rows).
Test= Struct_distance_stats.T
Novel= KDE_stats.T
n_comp = 4
# PCA on per-accession KDE likelihood profiles across windows/clusters.
pca1 = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')
likelihood_feats = pca1.fit_transform(Novel)
var_comps= pca1.explained_variance_ratio_
print("; ".join(['PC{0}: {1}'.format(x+1,round(pca1.explained_variance_ratio_[x],3)) for x in range(n_comp)]))
print('features shape: {}'.format(likelihood_feats.shape))
def kde_distances(gp):
    """Interactive 3D view of per-accession KDE likelihood features.

    gp == -1 colours accessions by Geneo group membership; otherwise
    accessions are coloured by their mean likelihood under the
    windows/clusters assigned to MeanShift group `gp`.
    """
    if gp == -1:
        apparel= Geneo
        fig_data= [go.Scatter3d(
            x = likelihood_feats[apparel[i],0],
            y = likelihood_feats[apparel[i],1],
            z = likelihood_feats[apparel[i],2],
            type='scatter3d',
            mode= "markers",
            name= 'Cl: {}'.format(i),
            text= ['gp: {}'.format(i) for x in apparel[i]],
            marker= {
                'line': {'width': 0},
                'size': 4,
                'symbol': 'circle',
                "opacity": 1
            }
        ) for i in apparel.keys()]
    else:
        sought_after= label_select[gp]
        print(len(sought_after))
        sought_after= [Combis[x] for x in sought_after]
        #print(sought_after[:4])
        # mean likelihood per accession over the selected windows/clusters;
        # profiles not covering every accession are discarded
        Likes= [z for z in it.chain(*[[Construct[el[0]][el[1]][y] for y in el[2]] for el in sought_after])]
        Likes= np.array([x for x in Likes if len(x) == sum(Sizes)])
        print(Likes.shape)
        Likes= np.mean(Likes, axis= 0).T
        print(Likes.shape)
        fig_data= [go.Scatter3d(
            x = likelihood_feats[:,0],
            y = likelihood_feats[:,1],
            z = likelihood_feats[:,2],
            type='scatter3d',
            mode= "markers",
            marker= {
                'color': Likes,
                'colorbar': go.ColorBar(
                    title= 'ColorBar'
                ),
                'colorscale': 'Viridis',
                'line': {'width': 0},
                'size': 4,
                'symbol': 'circle',
                "opacity": 1
            }
        )]
    layout = go.Layout(
        title= 'PCA red + MS clustering KDE profiles across windows',
        scene= Scene(
            yaxis=dict(
                title='{}'.format(round(var_comps[1],3))),
            xaxis=dict(
                title= '{}'.format(round(var_comps[0],3))),
            zaxis=dict(
                title= '{}'.format(round(var_comps[2],3))))
    )
    fig = go.Figure(data=fig_data,layout= layout)
    iplot(fig)
interact(kde_distances,gp= [-1,*[x for x in label_select.keys()]])
PC1: 0.299; PC2: 0.242; PC3: 0.013; PC4: 0.012 features shape: (300, 4)
Failed to display Jupyter Widget of type interactive
.
If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean that the widgets JavaScript is still loading. If this message persists, it likely means that the widgets JavaScript library is either not installed or not enabled. See the Jupyter Widgets Documentation for setup instructions.
If you're reading this message in another frontend (for example, a static rendering on GitHub or NBViewer), it may mean that your frontend doesn't currently support widgets.
<function __main__.kde_distances>
def plot_density(selected_group):
    """Plot the density of mean KDE likelihoods for one MeanShift group.

    Averages the likelihood vectors of every (chromosome, window,
    cluster-combination) assigned to `selected_group`, then shows their
    distribution as a distplot.
    """
    sought_after= label_select[selected_group]
    sought_after= [Combis[x] for x in sought_after]
    #print(sought_after[:4])
    Cl_number= len(sought_after)
    Likes= [z for z in it.chain(*[[Construct[el[0]][el[1]][y] for y in el[2]] for el in sought_after])]
    # keep only profiles covering every accession
    Likes= np.array([x for x in Likes if len(x) == sum(Sizes)])
    Likes= np.mean(Likes, axis= 0).T
    dense_plot= ff.create_distplot([Likes], [selected_group])
    dense_plot['layout'].update(title='<b>likelihood density. N vertices: {} .N_clusters: {}</b>'.format(Structure_vertice,Cl_number))
    iplot(dense_plot)
interact(plot_density,selected_group= [x for x in label_select.keys()])
Failed to display Jupyter Widget of type interactive
.
If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean that the widgets JavaScript is still loading. If this message persists, it likely means that the widgets JavaScript library is either not installed or not enabled. See the Jupyter Widgets Documentation for setup instructions.
If you're reading this message in another frontend (for example, a static rendering on GitHub or NBViewer), it may mean that your frontend doesn't currently support widgets.
<function __main__.plot_density>