import scipy
import numpy as np
import pandas as pd
import itertools as it
from math import sin
import collections
def recursively_default_dict():
    """Return a dict that creates nested dicts on first access.

    Missing keys at any depth yield another auto-vivifying mapping, so
    assignments like d[a][b][c] = v need no prior setup.
    """
    return collections.defaultdict(lambda: recursively_default_dict())
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import scale
from scipy.stats.stats import pearsonr
from scipy.stats import invgamma
from scipy.stats import beta
import matplotlib.pyplot as plt
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
import plotly.figure_factory as ff
from sklearn.metrics import silhouette_samples, silhouette_score
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
init_notebook_mode(connected=True)
import Lab_modules.StructE_tools as Ste
from Lab_modules.Generate_freq_vectors import generate_vectors_Beta
### Parameters for building a library of allele-frequency vectors.
Nbranches= 4  # number of branches; not used in this cell — presumably for later runs
L= 150  # haplotype length (number of loci per frequency vector)
n= 100  # vectors drawn per parameter combination
rangeA= [1,2.5]  # range explored for the Beta 'a' parameter
rangeB = [.1,.6]  # range explored for the Beta 'b' parameter
steps= 20  # steps across each parameter range
n_comp = 100  # PCA components retained
density= 50  # unused here — TODO confirm intended use
# Build the frequency-vector library from Beta distributions.
vector_lib= generate_vectors_Beta(L,n,rangeA,rangeB,steps,n_comp)
# Project the library onto its principal components.
pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(vector_lib)
features = pca.transform(vector_lib)
#features, vector_lib= generate_Branches_Beta(4,50,L,n,rangeA,rangeB,steps,n_comp)
print(features.shape)
print(vector_lib.shape)
(2000, 100) (2000, 150)
### Functions to manipulate genetic structure.
def sin_prior(coords,target,vector2,angle,fst_max= 0.2,passport= False):
    """Shift the target population along vector2 by a sinusoid of `angle`.

    Mutates `coords` in place and returns it; when passport is True the
    pair (coords, prior-ID) is returned instead.
    """
    ID= 'sinusoid'
    amplitude = (fst_max * 10 - 1) * sin(angle)
    displacement = [amplitude * component for component in vector2]
    coords[target[0]] = coords[target[0]] - displacement
    if passport:
        return coords,ID
    return coords
def linear_prior(coords,target,vector2,angle,region= [-5,5],slope= 1,passport= False):
    """Push the target population along vector2 in proportion to how far
    `angle` has progressed through `region`; no-op outside the region.

    Mutates `coords` in place; returns (coords, prior-ID) when passport
    is True, otherwise just coords.
    """
    ID= 'linear'
    lo, hi = region
    if lo <= angle <= hi:
        progression = abs(angle - lo) / (hi - lo)
        step = [slope * progression * component for component in vector2]
        coords[target[0]] = coords[target[0]] + step
    return (coords, ID) if passport else coords
def introgression_prior(coords,target,vector2,angle, region,passport= False):
    """Replace the target population's coordinates with population 1's
    whenever `angle` falls inside `region` (an introgression event).

    Mutates `coords` in place; returns (coords, prior-ID) when passport
    is True.  Note: vector2 is accepted for signature parity but unused.
    """
    ID= 'introgression'
    inside = region[0] <= angle <= region[1]
    if inside:
        coords[target[0]] = coords[1]
    return (coords, ID) if passport else coords
def alien_prior_I(coords,target,vector2,angle,fst= .2,region= [-5,5],passport= False):
    """Inside `region`, offset the target population along vector2 with a
    magnitude set by `fst`; outside it, collapse the target onto the
    second target population.

    Mutates `coords` in place; returns (coords, prior-ID) when passport
    is True.
    """
    ID= 'alien I'
    if region[0] <= angle <= region[1]:
        offset = [(10 * fst - 1) * component for component in vector2]
        coords[target[0]] = coords[target[0]] + offset
    else:
        coords[target[0]] = coords[target[1]]
    return (coords, ID) if passport else coords
def alien_prior_II(coords,target,vector2,angle,fst= .2,region= [-5,5],passport= False):
    """Inside `region`, offset the target population along vector2 with a
    magnitude set by `fst`; outside it, leave coordinates untouched.

    Mutates `coords` in place; returns (coords, prior-ID) when passport
    is True.
    """
    ID= 'alien II'
    if region[0] <= angle <= region[1]:
        shift = [(10 * fst - 1) * component for component in vector2]
        coords[target[0]] = coords[target[0]] + shift
    return (coords, ID) if passport else coords
def alien_prior_III(coords,target,vector2,angle,fst_a= 0.2,fst_b= .2,region= [-5,5],passport= False):
    """Offset the target population along vector2 with one magnitude
    (`fst_b`) inside `region` and another (`fst_a`) outside it.

    Mutates `coords` in place; returns (coords, prior-ID) when passport
    is True.
    """
    ID= 'alien III'
    inside = region[0] <= angle <= region[1]
    fst_used = fst_b if inside else fst_a
    shift = [(10 * fst_used - 1) * component for component in vector2]
    coords[target[0]] = coords[target[0]] + shift
    return (coords, ID) if passport else coords
### Simulate populations along a structure-manipulation path.
from Generate_samples import Gen_samples, Check_Path, plot_GenFst
Sizes= [100,100,100,100]  # haplotypes sampled per population
Npops= len(Sizes)
range_diff= [-10,10]  # range of the control "angle" swept across windows
# keyword arguments forwarded to the chosen prior function
prior_kwargs= {
    'fst_a': .0,
    'fst_b': 0.3,
    'region': [-3,-1]
}
prior_func= alien_prior_III
# Preview the Fst trajectory this prior produces before generating data.
fig, Pops, prior= Check_Path(Npops,vector_lib,prior_func,prior_kwargs,Pops= [],random= True,n_comp= L,range_diff= range_diff,steps= 100)
iplot(fig)
C:\Users\jgarcia\Desktop\Jupyter_stuff\Genetic-data-analysis\StructE_tools.py:398: RuntimeWarning: invalid value encountered in double_scalars
# (Pops,Sizes,vector_lib,return_pca= False,n_comp= 100,prior= 'sinusoid',range_diff= [-10,10],steps= 100)
# Generate haplotype data per window plus per-window Fst under the chosen prior.
SequenceStore, Fst_windows= Gen_samples(Pops,Sizes,vector_lib,prior_func,prior_kwargs,range_diff= range_diff,steps= 100)
...
C:\Users\jgarcia\Desktop\Jupyter_stuff\Genetic-data-analysis\StructE_tools.py:398: RuntimeWarning: invalid value encountered in double_scalars
Done.
### Check that we got what we wanted.
# Plot the manipulated Fst across windows for chromosome 1.
plot_GenFst(Fst_windows,Npops,1)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-6-c592d4660d6b> in <module>() 1 ### Check that we got what we wanted. 2 ----> 3 plot_GenFst(Fst_windows,Npops,1) ~\Desktop\Jupyter_stuff\Genetic-data-analysis\Generate_samples.py in plot_GenFst(Fst_lib, Npops, Chr) 176 mode= 'markers', 177 name= '{}'.format([x for x in it.combinations(range(Npops),2)][i]) --> 178 ) for i in range(len([x for x in it.combinations(range(Npops),2)])) 179 ] 180 ~\Desktop\Jupyter_stuff\Genetic-data-analysis\Generate_samples.py in <listcomp>(.0) 176 mode= 'markers', 177 name= '{}'.format([x for x in it.combinations(range(Npops),2)][i]) --> 178 ) for i in range(len([x for x in it.combinations(range(Npops),2)])) 179 ] 180 NameError: name 'Fst_windows' is not defined
### Perform Distance and association analysis on the data sets generated
### Define reference and admixed associations:
### for the purpose of this analysis exploration will be driven by
### structure distance profiles. Since KDE will be used for exploration,
### a set of accessions can be analysed that do not contribute to
### distance profiles.
admx_N= 50  # number of accessions held out as "admixed"
admx_indx= np.random.choice(range(sum(Sizes)),admx_N,replace= False)
ref_indx= [x for x in range(sum(Sizes)) if x not in admx_indx]
# population label of every haplotype, in storage order
labels_pops= np.repeat(range(len(Sizes)),Sizes)
##
# reference library: population label -> indices of its accessions
refs_lib= {x:[i for i in range(sum(Sizes)) if labels_pops[i] == x] for x in range(len(Sizes))}
# the held-out admixed set gets the next free population label
admx_lib= {(max(refs_lib.keys()) + 1): admx_indx}
admx_lib.update(refs_lib)
Geneo= admx_lib
### Define parameters and libraries of analyses.
DIMr = 'PCA'  # dimensionality-reduction method: 'PCA' or 'NMF'
Bandwidth_split= 30  # grid resolution of the KDE bandwidth search
ncomp_local= 4  # components kept per window
clsize= 15  # minimum MeanShift cluster size retained
# per-window result containers (auto-vivifying nested dicts)
Results = {x:recursively_default_dict() for x in SequenceStore.keys()}
Clover= []
Coordinates= []
Clusters_coords= []
Dist_vectors= []
Struct_dist_vectors= recursively_default_dict()
Construct = recursively_default_dict()
Distances_vectors= recursively_default_dict()
PC_var= recursively_default_dict()
center_distances= []
Ref_stats= []
# Per-window analysis: reduce dimensionality, cluster with MeanShift,
# score per-cluster KDE likelihoods, then compute AMOVA / ANOVA-style stats.
for CHR in SequenceStore.keys():
    print('going on CHR: '+ str(CHR))
    for c in SequenceStore[CHR].keys():
        print('data set: {}'.format(c))
        ### PCA and MeanShift of information from each window copied from *FM36_Galaxy.py.
        Sequences = SequenceStore[CHR][c]
        Sequences= np.nan_to_num(Sequences)
        print(Sequences.shape)
        #### Dimensionality reduction
        if DIMr == 'PCA':
            pca = PCA(n_components=ncomp_local, whiten=False,svd_solver='randomized').fit(Sequences)
            data = pca.transform(Sequences)
            PC_var[CHR][c]= [x for x in pca.explained_variance_]
        if DIMr == 'NMF':
            from sklearn.decomposition import NMF
            data = NMF(n_components=ncomp_local, init='random', random_state=0).fit_transform(Sequences)
        Accurate = []
        # KDE bandwidth grid spans the range of the reduced coordinates.
        params = {'bandwidth': np.linspace(np.min(data), np.max(data),Bandwidth_split)}
        grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0)
        ######################################
        ####### TEST global Likelihood #######
        ######################################
        # Only reference accessions drive the clustering.
        Focus_labels = [z for z in it.chain(*refs_lib.values())]
        #### Mean Shift approach
        ## from sklearn.cluster import MeanShift, estimate_bandwidth
        bandwidth = estimate_bandwidth(data, quantile=0.2, n_samples=len(Focus_labels))
        if bandwidth <= 1e-3:
            # guard against a degenerate bandwidth estimate
            bandwidth = 0.1
        ms = MeanShift(bandwidth=bandwidth, cluster_all=False, min_bin_freq=15)
        ms.fit(data[Focus_labels,:])
        labels = ms.labels_
        # cluster label -> original accession indices (noise label -1 dropped)
        Tree = {x:[Focus_labels[y] for y in range(len(labels)) if labels[y] == x] for x in [g for g in list(set(labels)) if g != -1]}
        Keep= [x for x in Tree.keys() if len(Tree[x]) > clsize]
        Tree= {x:Tree[x] for x in Keep}
        print('N clusters: {}, Sizes: {}'.format(len(Tree),[len(x) for x in Tree.values()]))
        SpaceX = {x:data[Tree[x],:] for x in Tree.keys()}
        ### Extract MScluster likelihood by sample
        for hill in SpaceX.keys():
            grid.fit(data[Tree[hill],:])
            # use the best estimator to compute the kernel density estimate
            kde = grid.best_estimator_
            P_dist = kde.score_samples(data[Tree[hill],:])
            Dist = kde.score_samples(data)
            P_dist= np.nan_to_num(P_dist)
            Dist= np.nan_to_num(Dist)
            if np.std(P_dist) == 0:
                # degenerate cluster: fall back to a membership indicator
                Dist= np.array([int(Dist[x] in P_dist) for x in range(len(Dist))])
            else:
                # normalise each sample's log-likelihood against the
                # cluster's own log-likelihood distribution
                Dist = scipy.stats.norm(np.mean(P_dist),np.std(P_dist)).cdf(Dist)
            Dist= np.nan_to_num(Dist)
            Construct[CHR][c][hill] = Dist
        #########################################
        ############# AMOVA ################
        #########################################
        # keep only accessions assigned to a retained cluster
        Who = [x for x in range(len(labels)) if labels[x] != -1 and labels[x] in Keep]
        labels = [labels[x] for x in Who]
        Who= [Focus_labels[x] for x in Who]
        AMOVA,Cig = Ste.findPhiPT(Sequences[Who,:],labels,n_boot=0)
        ########################################################################
        ############## Distances ###############################################
        ########################################################################
        Dist_vectors= Ste.Distance_profiles(data,Tree,1000,Bandwidth_split)
        if Dist_vectors:
            Distances_vectors[CHR][c]= Dist_vectors
        Structures= Ste.Structure_profiles(data,Tree,1000,Bandwidth_split)
        if Structures:
            Struct_dist_vectors[CHR][c]= Structures
        ################ Other stats
        SIL = silhouette_score(data[Who,:],labels)
        # one-way ANOVA-style decomposition of distances to centroids
        df_between = len(Tree) - 1
        df_within = sum([len(Tree[x]) for x in Tree.keys()]) - len(Tree)
        df_total = sum([len(Tree[x]) for x in Tree.keys()]) - 1
        Total = data[Who,:]
        mu = np.mean(Total)
        sd = np.std(Total)
        Total = (Total - mu) / sd
        T = np.sqrt(((Total - Total.mean(axis=0))**2).sum(axis=1)).sum()**2 / len(Total)
        Y = (np.sqrt(((Total - Total.mean(axis=0))**2).sum(axis = 1))**2).sum()
        SS_between = -T
        SS_within = Y
        t_s = np.empty([len(Total),0])
        for gp in Tree.keys():
            Gp = Total[[x for x in range(Total.shape[0]) if labels[x] == gp],:]
            Gp = (Gp - mu) / sd
            SS_between += (np.sqrt(((Gp - Total.mean(axis=0))**2).sum(axis = 1)).sum()**2 / len(Gp))
            SS_within -= (np.sqrt(((Gp - Total.mean(axis=0))**2).sum(axis = 1)**2).sum() / len(Gp))
            t_s = np.append(t_s,np.sqrt(((Gp - Gp.mean(axis=0))**2)).sum(axis = 1))
        SS_total = Y - T
        MS_between = SS_between / df_between
        MS_within = SS_within / df_within
        # paired t-test: per-cluster centroid distances vs global centroid distances
        sqrt_center_dists= np.sqrt(((Total - Total.mean(axis=0))**2)).sum(axis = 1)
        t_stat,p_val = scipy.stats.ttest_rel(t_s,sqrt_center_dists)
        F = MS_between / float(MS_within)
        N_PC= len(Tree)
        eta_sqrd = SS_between / SS_total
        om_sqrd = (SS_between - (df_between * MS_within))/(SS_total + MS_within)
        Results[CHR][c] = [N_PC,F,om_sqrd,MS_between,t_stat,AMOVA,SIL,len(Tree),]
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-2-73ed7514efd0> in <module>() 9 10 admx_N= 50 ---> 11 admx_indx= np.random.choice(range(sum(Sizes)),admx_N,replace= False) 12 13 ref_indx= [x for x in range(sum(Sizes)) if x not in admx_indx] NameError: name 'Sizes' is not defined
# Gather the manipulated Fst series for every population pair.
Fst_pairs= [i for i in it.combinations(range(len(Sizes)),2)]
Fst_lib= []
for pair in range(len(Fst_pairs)):
    Fst_crawl= [[Fst_windows[Chr][n].fst[pair] for n in Fst_windows[Chr].keys()] for Chr in Fst_windows.keys()]
    Fst_crawl= [z for z in it.chain(*Fst_crawl)]
    Fst_lib.append(Fst_crawl)
# Flatten the per-window statistics into one array.
Other_stats= [[[Chr,wind,*Results[Chr][wind]] for wind in Results[Chr].keys()] for Chr in Results.keys()]
Other_stats= np.array([y for y in it.chain(*Other_stats)])
print(Other_stats.shape)
# columns [CHR,window,N_PC,F,om_sqrd,MS_between,t_stat,AMOVA,SIL]
# Scatter AMOVA (column 7) against the manipulated Fst of each pair.
fig_data= [go.Scatter(
    x= Fst_lib[i],
    y= Other_stats[:,7],
    mode= 'markers',
    name= str(Fst_pairs[i])
) for i in range(len(Fst_pairs))]
layout = go.Layout(
    title= 'Stats',
    yaxis=dict(
        title='AMOVA'),
    xaxis=dict(
        title='Manipulated Fst')
)
fig= go.Figure(data=fig_data, layout=layout)
iplot(fig)
(40, 9)
# Scatter AMOVA (column 7) against silhouette score (column 8) per window.
fig_data= [go.Scatter(
    x= Other_stats[:,8],
    y= Other_stats[:,7],
    mode= 'markers',
    name= 'Results'
)]
layout = go.Layout(
    title= 'Stats',
    yaxis=dict(
        title='AMOVA'),
    xaxis=dict(
        title='SIL')
)
fig= go.Figure(data=fig_data, layout=layout)
iplot(fig)
#####
# Stack the KDE likelihood vectors (one per window x cluster) into a matrix.
KDE_stats= [[[Construct[Chr][wind][gp] for gp in sorted(Construct[Chr][wind].keys())] for wind in sorted(Construct[Chr].keys())] for Chr in Construct.keys()]
KDE_stats= np.array([y for y in it.chain(*[z for z in it.chain(*KDE_stats)])])
KDE_stats= np.array(KDE_stats)
n_comp = 4
# Reduce the likelihood profiles and report variance explained.
pca1 = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')
likelihood_feats = pca1.fit_transform(KDE_stats)
var_comps= pca1.explained_variance_ratio_
print("; ".join(['PC{0}: {1}'.format(x+1,round(pca1.explained_variance_ratio_[x],3)) for x in range(n_comp)]))
print('features shape: {}'.format(likelihood_feats.shape))
## perform MeanShift clustering.
iu1 = np.triu_indices(KDE_stats.shape[0])
bandwidth = estimate_bandwidth(likelihood_feats, quantile=0.20)
params = {'bandwidth': np.linspace(np.min(likelihood_feats), np.max(likelihood_feats),30)}
grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=False, min_bin_freq=15)
ms.fit(likelihood_feats)
labels1 = ms.labels_
# cluster label -> row indices (noise label -1 excluded)
label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1))) if y != -1}
MS_centroids= [np.mean(likelihood_feats[label_select[z],:],axis= 0) for z in label_select.keys()]
MS_pair_dist= pairwise_distances(MS_centroids,metric= 'euclidean')
#MS_pair_dist= MS_pair_dist[iu_control]
# 3D scatter of the first three PCs, coloured by MeanShift cluster.
fig_data= [go.Scatter3d(
    x = likelihood_feats[label_select[i],0],
    y = likelihood_feats[label_select[i],1],
    z = likelihood_feats[label_select[i],2],
    type='scatter3d',
    mode= "markers",
    name= 'Cl: {}'.format(i),
    text= ['gp: {}'.format(i) for x in label_select[i]],
    marker= {
        'line': {'width': 0},
        'size': 4,
        'symbol': 'circle',
        "opacity": 1
    }
) for i in label_select.keys()]
layout = go.Layout(
    title= 'PCA red + MS clustering KDE profiles across windows',
    scene= Scene(
        yaxis=dict(
            title='{}'.format(round(var_comps[1],3))),
        xaxis=dict(
            title= '{}'.format(round(var_comps[0],3))),
        zaxis=dict(
            title= '{}'.format(round(var_comps[2],3))))
)
fig = go.Figure(data=fig_data,layout= layout)
iplot(fig)
PC1: 0.3; PC2: 0.24; PC3: 0.017; PC4: 0.013 features shape: (110, 4)
### Converting distances to fst's.
### Select pre and post processing measures.
Eigen = False  # weight PCs by explained variance ratio
Scale= False  # scale genotype matrices before PCA
Center= True
MixL= False  # select whether to mix haplotype lengths or not.
Length_increment= False  # step haplotype length up across repeats
L_step= 5
MixP= True  # select whether to vary the number of populations or not.
Pairs= False # select if comparing Pairs of distances or the distances themselves
Control_inc= True  # include the fixed control pair in every PCA fit
Predict= False  # predict Fst from distances (requires m_coeff, b from a prior fit)
length_haps= vector_lib.shape[1]
length_range= [75,vector_lib.shape[1]]
length_step= 10
pop_max= 8 # Number of pops
n_comp= 10 # components to keep following PCA
Iter= 20 # repeats
N_sims= 100 # number of haplotypes to generate from each pop in the unbiased scenario.
#### Predict
predicted= []
#def controled_fsts(vector_lib,Eigen,length_haps,Scale,Center,N_pops,n_comp,Iter,N_sims,MixL,MixP,Pairs):
lengths_vector= []
### Control set to include in the transformation:
control_vecs= np.random.choice(range(vector_lib.shape[0]),2)
control_labels= np.repeat([0,1],N_sims)
### Control set distances
control_even_distances= []
control_bias_distances= []
### store distances between centroids
biased_pairwise= []
unbiased_pairwise= []
corrected_pairwise= []
### store PC projection:
dist_PC_even= {x:[] for x in range(n_comp)}
dist_PC_bias= {x:[] for x in range(n_comp)}
dist_PC_corrected= {x:[] for x in range(n_comp)}
### store incremental PC distances
dist_increment_even= {x:[] for x in range(1,n_comp)}
dist_increment_bias= {x:[] for x in range(1,n_comp)}
### store fsts
fst_store= []
### proceed.
# Main experiment loop: per repeat, draw source populations, simulate
# haplotypes under even and biased sampling schemes, and compare PCA
# centroid distances against the true Fst between source vectors.
for rep in range(Iter):
    if MixP:
        # vary the number of populations per repeat
        N_pops= np.random.choice(range(3,pop_max),1,replace= False)[0]
    else:
        N_pops= pop_max
    if MixL:
        length_haps= np.random.choice(length_range,1)[0]
    if Length_increment:
        length_haps= int(length_range[0] + L_step * np.floor(rep / L_step))
    ## Population Sizes and labels
    bias_scheme= np.random.choice(range(25,200),N_pops,replace= False)  # uneven sizes
    unbiased_sheme= np.repeat(N_sims,N_pops)  # even sizes
    bias_labels= np.repeat(np.array([x for x in range(N_pops)]),bias_scheme)
    unbias_labels= np.repeat(np.array([x for x in range(N_pops)]),unbiased_sheme)
    ### triangular matrices extract.
    iu1= np.triu_indices(N_pops,1) # for centroid comparison
    iu_unbias= np.triu_indices(sum(unbiased_sheme),1)
    iu_bias= np.triu_indices(sum(bias_scheme),1)
    iu_control= np.triu_indices(2,1)
    Pops= np.random.choice(vector_lib.shape[0],N_pops,replace= False)
    print('Iter: {}, vectors selected: {}, hap length: {}'.format(rep,Pops,length_haps))
    ########## FST
    freqs_selected= vector_lib[Pops,:length_haps]
    Pairwise= Ste.return_fsts2(freqs_selected)
    #fsts_compare = scale(Pairwise.fst)
    fsts_compare= Pairwise.fst
    if Pairs:
        # compare ratios of Fst pairs instead of the Fsts themselves
        t= fsts_compare
        fsts_compare= [min([t[y] for y in z]) / max([t[y] for y in z]) for z in it.combinations(range(len(t)),2)]
    fst_store.extend(fsts_compare)
    ## lengths
    lengths_vector.extend([length_haps] * len(fsts_compare))
    #########################################################
    ########### PCA ####################################
    #########################################################
    ### control sample
    control_data= []
    for k in range(2):
        probs= vector_lib[control_vecs[k],:length_haps]
        # clip frequencies into [0, 1]; NOTE(review): this slice is a view,
        # so clipping writes back into vector_lib rows — confirm intended.
        probs[probs < 0] = 0
        probs[probs > 1] = 1
        m= unbiased_sheme[k]
        # draw binary haplotypes from the clipped frequencies
        Haps= [[np.random.choice([1,0],p= [1-probs[x],probs[x]]) for x in range(length_haps)] for acc in range(m)]
        control_data.extend(Haps)
    control_data= np.array(control_data)
    #### generate data and perform PCA.
    data= []
    for k in range(N_pops):
        probs= vector_lib[Pops[k],:length_haps]
        probs[probs < 0] = 0
        probs[probs > 1] = 1
        m= unbiased_sheme[k]
        Haps= [[np.random.choice([1,0],p= [1-probs[x],probs[x]]) for x in range(length_haps)] for acc in range(m)]
        data.extend(Haps)
    data1= np.array(data)
    if Scale:
        data1= scale(data1)
    if Control_inc:
        # fit PCA on controls + data so control distances are comparable
        pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(np.vstack([control_data,data1]))
        control_unbias_feat= pca.transform(control_data)
    else:
        pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(data1)
    feat_unbias= pca.transform(data1)
    if Eigen:
        feat_unbias= feat_unbias * pca.explained_variance_ratio_
    ####### centroid comparison
    #### Controls
    if Control_inc:
        control_centroids= [np.mean(control_unbias_feat[[y for y in range(control_unbias_feat.shape[0]) if control_labels[y] == z],:],axis= 0) for z in range(2)]
        control_centroids= np.array(control_centroids)
        unbias_control_dist= pairwise_distances(control_centroids,metric= 'euclidean')
        unbias_control_dist= unbias_control_dist[iu_control]
        control_even_distances.extend(unbias_control_dist)
    ####
    unbias_centroids= [np.mean(feat_unbias[[y for y in range(feat_unbias.shape[0]) if unbias_labels[y] == z],:],axis= 0) for z in range(N_pops)]
    unbias_centroids= np.array(unbias_centroids)
    unbias_pair_dist= pairwise_distances(unbias_centroids,metric= 'euclidean')
    unbias_pair_dist= unbias_pair_dist[iu1]
    if Predict:
        # map centroid distances to Fst via the fitted log-log regression
        fst_pred= [np.exp(m_coeff*np.log(x) + b) for x in unbias_pair_dist]
        predicted.extend(fst_pred)
        #print(np.array([fst_pred,fsts_compare]).T)
    #unbias_pair_dist= scale(unbias_pair_dist)
    unbiased_pairwise.extend(unbias_pair_dist)
    #################################################
    ############## biased sample
    #### generate data and perform PCA
    data= []
    for k in range(N_pops):
        probs= vector_lib[Pops[k],:]
        m= bias_scheme[k]
        Haps= [[np.random.choice([1,0],p= [1-probs[x],probs[x]]) for x in range(length_haps)] for acc in range(m)]
        data.extend(Haps)
    data2= np.array(data)
    if Scale:
        data2= scale(data2)
    if Control_inc:
        pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(np.vstack([control_data,data2]))
        control_bias_feat= pca.transform(control_data)
    else:
        pca = PCA(n_components=n_comp, whiten=False,svd_solver='randomized').fit(data2)
    feat_bias= pca.transform(data2)
    if Eigen:
        feat_bias= feat_bias * pca.explained_variance_ratio_
    #### Centroid distances
    #### Controls
    if Control_inc:
        control_centroids= [np.mean(control_bias_feat[[y for y in range(control_bias_feat.shape[0]) if control_labels[y] == z],:],axis= 0) for z in range(2)]
        control_centroids= np.array(control_centroids)
        bias_control_dist= pairwise_distances(control_centroids,metric= 'euclidean')
        bias_control_dist= bias_control_dist[iu_control]
        control_bias_distances.extend(bias_control_dist)
    bias_centroids= [np.mean(feat_bias[[y for y in range(feat_bias.shape[0]) if bias_labels[y] == z],:],axis= 0) for z in range(N_pops)]
    bias_centroids= np.array(bias_centroids)
    bias_pair_dist= pairwise_distances(bias_centroids,metric= 'euclidean')
    bias_pair_dist= bias_pair_dist[iu1]
    #bias_pair_dist= scale(bias_pair_dist)
    if Pairs:
        t= bias_pair_dist
        bias_pair_dist= [min([t[y] for y in z]) / max([t[y] for y in z]) for z in it.combinations(range(len(t)),2)]
    biased_pairwise.extend(bias_pair_dist)
    t= np.array([
        fsts_compare,
        unbias_pair_dist,
        bias_pair_dist
    ]).T
# Fit a log-log regression of Fst on biased centroid distance, restricted
# to one haplotype length and a trusted Fst window; m_coeff and b are
# used elsewhere to convert distances to Fst.
Size= length_haps
fst_lm_range= [.02,.3]
Lindexes= [x for x in range(len(lengths_vector)) if lengths_vector[x] == Size and fst_store[x] >= fst_lm_range[0] and fst_store[x] <= fst_lm_range[1]]
y_true= [np.log(biased_pairwise[x]) for x in Lindexes]
m_coeff,b= np.polyfit(y_true,[np.log(fst_store[x]) for x in Lindexes],1)
Iter: 0, vectors selected: [ 80 72 108 94], hap length: 150
C:\Users\jgarcia\Desktop\Jupyter_stuff\Distance_stats\StructE_tools.py:398: RuntimeWarning: invalid value encountered in double_scalars
Iter: 1, vectors selected: [159 138 146 6 158 118], hap length: 150 Iter: 2, vectors selected: [132 134 42 72], hap length: 150 Iter: 3, vectors selected: [ 26 72 181 93 177 165], hap length: 150 Iter: 4, vectors selected: [ 71 39 120], hap length: 150 Iter: 5, vectors selected: [ 22 116 123 184 111], hap length: 150 Iter: 6, vectors selected: [ 84 90 172 78 137], hap length: 150 Iter: 7, vectors selected: [ 46 118 23 19 27 127], hap length: 150 Iter: 8, vectors selected: [194 143 170 13 17 0 14], hap length: 150 Iter: 9, vectors selected: [168 120 177 8], hap length: 150 Iter: 10, vectors selected: [189 14 73 31 85 123 33], hap length: 150 Iter: 11, vectors selected: [ 88 123 83 129 142], hap length: 150 Iter: 12, vectors selected: [ 57 74 171 39 93], hap length: 150 Iter: 13, vectors selected: [ 45 94 167 107 199 142 143], hap length: 150 Iter: 14, vectors selected: [ 94 77 104], hap length: 150 Iter: 15, vectors selected: [163 19 172], hap length: 150 Iter: 16, vectors selected: [178 164 76 81], hap length: 150 Iter: 17, vectors selected: [133 15 73], hap length: 150 Iter: 18, vectors selected: [130 65 101 105], hap length: 150 Iter: 19, vectors selected: [ 32 74 85 140 93], hap length: 150
# Keep only structure profiles computed over this many clusters (vertices).
Structure_vertice= [2]
# Record (chromosome, window, cluster-combination) for every retained profile.
Combis= []
for chromo in Struct_dist_vectors.keys():
    for winds in Struct_dist_vectors[chromo].keys():
        for grp in Struct_dist_vectors[chromo][winds].keys():
            if len(grp) in Structure_vertice:
                Combis.append([chromo,winds,grp])
#Struct_distance_stats= np.array([y for y in it.chain(*Struct_distance_stats)])
# Stack the retained structure-distance profiles into one matrix
# (row order matches Combis).
Struct_distance_stats= [[[Struct_dist_vectors[Chr][wind][gp] for gp in sorted(Struct_dist_vectors[Chr][wind].keys()) if len(gp) in Structure_vertice] for wind in sorted(Struct_dist_vectors[Chr].keys())] for Chr in Struct_dist_vectors.keys()]
Struct_distance_stats= np.array([y for y in it.chain(*[z for z in it.chain(*Struct_distance_stats)])])
Struct_distance_stats= np.array(Struct_distance_stats)
n_comp = 4
# PCA on the structure-distance profiles.
pcaS = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')
Struct_dist_feats = pcaS.fit_transform(Struct_distance_stats)
var_compsQ= pcaS.explained_variance_ratio_
print("; ".join(['PC{0}: {1}'.format(x+1,round(pcaS.explained_variance_ratio_[x],3)) for x in range(n_comp)]))
print('features shape: {}'.format(Struct_dist_feats.shape))
# MeanShift clustering of the reduced profiles.
bandwidth = estimate_bandwidth(Struct_dist_feats, quantile=0.2)
params = {'bandwidth': np.linspace(np.min(Struct_dist_feats), np.max(Struct_dist_feats),30)}
grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=False, min_bin_freq=25)
ms.fit(Struct_dist_feats)
labels1 = ms.labels_
#labels1= [int(1 in x[2]) for x in Combis]
#labels1= [len(x[2]) for x in Combis]
#print(Combis[:10])
# cluster label -> row indices (noise label -1 excluded)
label_select = {y:[x for x in range(len(labels1)) if labels1[x] == y] for y in sorted(list(set(labels1))) if y != -1}
#### Let's plot the first 3 coordinates nonetheless.
####
# 3D scatter of the first three PCs of the structure-distance profiles,
# coloured by MeanShift cluster.
fig_data= [go.Scatter3d(
    x = Struct_dist_feats[label_select[i],0],
    y = Struct_dist_feats[label_select[i],1],
    z = Struct_dist_feats[label_select[i],2],
    type='scatter3d',
    mode= "markers",
    name= 'Cl: {}'.format(i),
    text= ['gp: {}'.format(i) for x in label_select[i]],
    marker= {
        'line': {'width': 0},
        'size': 4,
        'symbol': 'circle',
        "opacity": 1
    }
) for i in label_select.keys()]
layout = go.Layout(
    title= 'PCA red + MS clustering PCA dist profiles across windows',
    scene= Scene(
        yaxis=dict(
            title='{}'.format(round(var_compsQ[1],3))),
        xaxis=dict(
            title= '{}'.format(round(var_compsQ[0],3))),
        zaxis=dict(
            title= '{}'.format(round(var_compsQ[2],3))))
)
fig = go.Figure(data=fig_data, layout=layout)
iplot(fig)
PC1: 0.894; PC2: 0.06; PC3: 0.014; PC4: 0.011 features shape: (100, 4)
def plot_density(selected_group):
    """Plot the mean structure-distance profile of one MeanShift group,
    with the distance axis converted to Fst via the fitted log-log model.
    """
    profile_rows= Struct_distance_stats[label_select[selected_group],:]
    print(profile_rows.shape)
    mean_profile= np.mean(profile_rows,axis=0)
    # map the distance grid onto the Fst scale: fst = exp(m*log(d) + b)
    fst_axis= [np.exp(m_coeff * np.log(d) + b) for d in np.linspace(0,8,profile_rows.shape[1])]
    trace= go.Scatter(
        x= fst_axis,
        y= mean_profile,
        mode= 'markers',
        name= 'Results'
    )
    fig_layout= go.Layout(
        title= 'Stats',
        yaxis=dict(
            title='density'),
        xaxis=dict(
            title= 'Fst')
    )
    iplot(go.Figure(data=[trace], layout=fig_layout))
interact(plot_density,selected_group= label_select.keys())
Failed to display Jupyter Widget of type interactive
.
If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean that the widgets JavaScript is still loading. If this message persists, it likely means that the widgets JavaScript library is either not installed or not enabled. See the Jupyter Widgets Documentation for setup instructions.
If you're reading this message in another frontend (for example, a static rendering on GitHub or NBViewer), it may mean that your frontend doesn't currently support widgets.
<function __main__.plot_density>
# Transpose so accessions become the observations (rows).
Test= Struct_distance_stats.T
Novel= KDE_stats.T
n_comp = 4
# PCA on per-accession KDE likelihood profiles across windows/clusters.
pca1 = PCA(n_components=n_comp, whiten=False,svd_solver='randomized')
likelihood_feats = pca1.fit_transform(Novel)
var_comps= pca1.explained_variance_ratio_
print("; ".join(['PC{0}: {1}'.format(x+1,round(pca1.explained_variance_ratio_[x],3)) for x in range(n_comp)]))
print('features shape: {}'.format(likelihood_feats.shape))
def kde_distances(gp):
    """Interactive 3D view of per-accession KDE likelihood features.

    gp == -1 colours accessions by Geneo group membership; otherwise
    accessions are coloured by their mean likelihood under the
    windows/clusters assigned to MeanShift group `gp`.
    """
    if gp == -1:
        apparel= Geneo
        fig_data= [go.Scatter3d(
            x = likelihood_feats[apparel[i],0],
            y = likelihood_feats[apparel[i],1],
            z = likelihood_feats[apparel[i],2],
            type='scatter3d',
            mode= "markers",
            name= 'Cl: {}'.format(i),
            text= ['gp: {}'.format(i) for x in apparel[i]],
            marker= {
                'line': {'width': 0},
                'size': 4,
                'symbol': 'circle',
                "opacity": 1
            }
        ) for i in apparel.keys()]
    else:
        sought_after= label_select[gp]
        print(len(sought_after))
        sought_after= [Combis[x] for x in sought_after]
        #print(sought_after[:4])
        # mean likelihood per accession over the selected windows/clusters;
        # profiles not covering every accession are discarded
        Likes= [z for z in it.chain(*[[Construct[el[0]][el[1]][y] for y in el[2]] for el in sought_after])]
        Likes= np.array([x for x in Likes if len(x) == sum(Sizes)])
        print(Likes.shape)
        Likes= np.mean(Likes, axis= 0).T
        print(Likes.shape)
        fig_data= [go.Scatter3d(
            x = likelihood_feats[:,0],
            y = likelihood_feats[:,1],
            z = likelihood_feats[:,2],
            type='scatter3d',
            mode= "markers",
            marker= {
                'color': Likes,
                'colorbar': go.ColorBar(
                    title= 'ColorBar'
                ),
                'colorscale': 'Viridis',
                'line': {'width': 0},
                'size': 4,
                'symbol': 'circle',
                "opacity": 1
            }
        )]
    layout = go.Layout(
        title= 'PCA red + MS clustering KDE profiles across windows',
        scene= Scene(
            yaxis=dict(
                title='{}'.format(round(var_comps[1],3))),
            xaxis=dict(
                title= '{}'.format(round(var_comps[0],3))),
            zaxis=dict(
                title= '{}'.format(round(var_comps[2],3))))
    )
    fig = go.Figure(data=fig_data,layout= layout)
    iplot(fig)
interact(kde_distances,gp= [-1,*[x for x in label_select.keys()]])
PC1: 0.299; PC2: 0.242; PC3: 0.013; PC4: 0.012 features shape: (300, 4)
Failed to display Jupyter Widget of type interactive
.
If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean that the widgets JavaScript is still loading. If this message persists, it likely means that the widgets JavaScript library is either not installed or not enabled. See the Jupyter Widgets Documentation for setup instructions.
If you're reading this message in another frontend (for example, a static rendering on GitHub or NBViewer), it may mean that your frontend doesn't currently support widgets.
<function __main__.kde_distances>
def plot_density(selected_group):
    """Plot the density of mean KDE likelihoods for one MeanShift group.

    Averages the likelihood vectors of every (chromosome, window,
    cluster-combination) assigned to `selected_group`, then shows their
    distribution as a distplot.
    """
    sought_after= label_select[selected_group]
    sought_after= [Combis[x] for x in sought_after]
    #print(sought_after[:4])
    Cl_number= len(sought_after)
    Likes= [z for z in it.chain(*[[Construct[el[0]][el[1]][y] for y in el[2]] for el in sought_after])]
    # keep only profiles covering every accession
    Likes= np.array([x for x in Likes if len(x) == sum(Sizes)])
    Likes= np.mean(Likes, axis= 0).T
    dense_plot= ff.create_distplot([Likes], [selected_group])
    dense_plot['layout'].update(title='<b>likelihood density. N vertices: {} .N_clusters: {}</b>'.format(Structure_vertice,Cl_number))
    iplot(dense_plot)
interact(plot_density,selected_group= [x for x in label_select.keys()])
Failed to display Jupyter Widget of type interactive
.
If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean that the widgets JavaScript is still loading. If this message persists, it likely means that the widgets JavaScript library is either not installed or not enabled. See the Jupyter Widgets Documentation for setup instructions.
If you're reading this message in another frontend (for example, a static rendering on GitHub or NBViewer), it may mean that your frontend doesn't currently support widgets.
<function __main__.plot_density>