import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
import pandas as pd
from scipy import stats
from scipy.stats import beta
import matplotlib.pyplot as plt
import itertools as it
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Enable plotly's offline mode so figures render inline in the notebook
# (no Chart Studio account or network round-trip required).
init_notebook_mode(connected=True)
In this post we're going to deal with a common occurrence in biological datasets, one not often taken into account: local mis-labelling of reference accessions.
When estimating the origin of unlabelled accessions, we tend to assume (and could be right on occasion) that our reference populations are "pure". That is, that the misrepresentation of their genepool will have arisen from biases in sampling, which we attempt to correct for, but biases in sampling only.
In truth, that is rarely the case. Ancestors of our reference individuals might have migrated, by their own means or through human action, and come into contact with the ancestors of modern representatives of other populations. This is particularly true of domesticated species.
In addition, since our data sets are not complete, we have no way of knowing whether our modern references have received material from representatives of cryptic populations yet uncharacterized.
On this post, we shall attempt to deal with these scenarios, and to see how this can be done using the same tools as in previous posts.
## Let's go ahead and define the density function of our allele frequencies.
# :: Refer to previous posts if you wish to visualize this density under the beta distribution ::
# Beta-distribution shape parameters for the allele-frequency density.
a, b = 1.5, .2
N_pops = 4                   # number of populations to generate
L = 200                      # length of haplotypes (number of loci)
Sizes = [250, 100, 300]      # size of each reference population
labels = [0, 1, 2]           # the fourth population is to be our unknown.
# Number of unlabelled individuals to draw from each population:
n_unlab = {0: 5, 1: 3, 2: 7, 3: 20}
# Simulated crosses between distributions. This dictionary indicates from whom a given pop will
# receive its mislabelled material, and how many haplotypes will have been contributed:
# Across = {target_pop: {Contributing_pop: N_contributions}}
Across = {
    0: {1: 3, 2: 5, 3: 4},
    1: {0: 7, 2: 2},
    2: {0: 15, 1: 4, 3: 3},
}
# One reference label per reference accession, in population order.
label_vector = np.repeat(np.array(labels), Sizes)
# Allelic frequency vector characterizing each population, one row per population.
prob_vectors = np.array([beta.rvs(a, b, size=L) for _ in range(N_pops)])
# Trim probabilities exceeding 1 (equivalent to prob_vectors[prob_vectors > 1] = 1).
np.clip(prob_vectors, None, 1, out=prob_vectors)
## Drawing haplotypes.
data = []
for k in range(len(labels)):
    own_freqs = prob_vectors[k, :]
    # Accessions genuinely drawn from population k: the remainder after
    # reserving room for the mislabelled contributions in Across[k].
    n_own = Sizes[k] - sum(Across[k].values())
    for _ in range(n_own):
        data.append([np.random.choice([1, 0], p=[1 - own_freqs[x], own_freqs[x]])
                     for x in range(L)])
    ## Mislabelled haplotypes: drawn from the donor populations listed in Across[k],
    ## but stored under population k's label.
    for donor, n_donated in Across[k].items():
        donor_freqs = prob_vectors[donor, :]
        for _ in range(n_donated):
            data.append([np.random.choice([1, 0], p=[1 - donor_freqs[x], donor_freqs[x]])
                         for x in range(L)])
data = np.array(data)
## Create incognita haplotypes from both our known distributions as well as a fourth,
## uncharacterized population (keys of n_unlab).
admixed = {}
for k, n_draw in n_unlab.items():
    freqs = prob_vectors[k, :]
    admixed[k] = [
        [np.random.choice([1, 0], p=[1 - freqs[x], freqs[x]]) for x in range(L)]
        for _ in range(n_draw)
    ]
# Principal component analysis of the data generated.
n_comp = 3  ## number of components to retain
## Fit the PCA on references AND unlabelled accessions together, so as not to lose
## variation absent from our reference populations.
unlabelled_stack = list(it.chain(*admixed.values()))
pca = PCA(n_components=n_comp, whiten=False, svd_solver='randomized')
pca.fit(np.vstack((data, unlabelled_stack)))
features = pca.transform(data)
print('-- PCA on haplotypes generated --')
print("Variance explained:")
print("; ".join('PC{0}: {1}'.format(pc + 1, round(pca.explained_variance_ratio_[pc], 3))
                for pc in range(n_comp)))
-- PCA on haplotypes generated -- Variance explained: PC1: 0.163; PC2: 0.084; PC3: 0.025
# Plot the results.
## Reference populations: one 3-D scatter trace per reference label.
## The original rebuilt the same index list three times per trace (once each for
## x, y and z); a boolean mask computed once per label is equivalent and O(n) total.
fig_data = []
for i in labels:
    idx = label_vector == i  # mask over reference accessions carrying label i
    fig_data.append(go.Scatter3d(
        x=features[idx, 0],
        y=features[idx, 1],
        z=features[idx, 2],
        mode="markers",
        marker={
            'line': {'width': 0},
            'size': 4,
            'symbol': 'circle',
            "opacity": 1
        },
        name=str(i)
    ))
## Incognita: project the unlabelled haplotypes with the already-fitted PCA
## and add them as a single black trace.
admx_t = np.array([y for y in it.chain(*admixed.values())])
admx_t = pca.transform(admx_t)
fig_data.append(
    go.Scatter3d(
        x=admx_t[:, 0],
        y=admx_t[:, 1],
        z=admx_t[:, 2],
        mode="markers",
        marker={
            'color': 'rgb(0,0,0)',
            'line': {'width': 0},
            'size': 4,
            'symbol': 'circle',
            "opacity": .8
        },
        name='unlabelled'
    ))
layout = go.Layout(
    margin=dict(l=0, r=0, b=0, t=0)
)
fig = go.Figure(data=fig_data, layout=layout)
iplot(fig)
Principal component analysis of the haplotypes generated in Block 1. Haplotypes in black lack population assignment. We also see some cases of mislabelling, which we created by using allele frequencies from other populations for some elements of each.
These mislabelled haplotypes will still be included in the KDE of the population they're labelled to.
Haplotypes will be assigned to the population presenting the highest score at its location, and a lower threshold is set, such that observations bearing maximum scores below it will be classed as outliers.
## Setting our lower limit: accessions whose best score falls below this are outliers.
Outlier_threshold = 1e-3
## Stacking our data: unlabelled projections first, then the reference features.
global_data = np.vstack((admx_t, features))
## Candidate kernel bandwidths -- a proxy of the local differentiation allowed for.
## FIX: PCA coordinates are centred, so np.min(features) is negative and the original
## grid contained non-positive bandwidths, which are invalid for KernelDensity.
## Keep only the strictly positive candidates from the same linspace.
bandwidths = np.linspace(np.min(features), np.max(features), 20)
params = {'bandwidth': bandwidths[bandwidths > 0]}
grid = GridSearchCV(KernelDensity(algorithm="ball_tree", breadth_first=False), params, verbose=0)
## Estimate the Kernel Density of each reference population, then extract normalized
## scores for every accession in the data set.
Scores = []
for lab in labels:
    Quanted_set = features[label_vector == lab, :]
    grid.fit(Quanted_set)
    kde = grid.best_estimator_
    P_dist = kde.score_samples(Quanted_set)  # log-densities of the reference itself
    Fist = kde.score_samples(global_data)    # log-densities of every accession
    ## Normalizing log-likelihood estimates by those of the reference set.
    Fist = scipy.stats.norm(np.mean(P_dist), np.std(P_dist)).cdf(Fist)
    Scores.append(Fist)
Scores = np.array(Scores).T
## Assign each accession the label with the highest score.
new_labels = np.argmax(Scores, axis=1)
## Identify cases where all individual scores are below our threshold.
## FIX: the original recomputed np.amax(Scores, axis=1) once per accession inside a
## comprehension (accidentally quadratic); compute it once and mask.
max_scores = np.amax(Scores, axis=1)
new_labels[max_scores < Outlier_threshold] = -1
print('First twenty assignments:')
print(new_labels[:20])
print('label assignments:')
print(', '.join(['label {0}: {1}'.format(x, new_labels.tolist().count(x)) for x in list(set(new_labels))]))
First twenty assignments: [ 0 0 0 0 0 1 1 1 2 2 2 2 2 2 2 -1 -1 -1 -1 -1] label assignments: label 0: 265, label 1: 101, label 2: 292, label -1: 27
## Plot the final assignments. Index -1 (outliers) maps onto the last entry, "outlier".
names = ["0", "1", "2", "outlier"]
## FIX: the original rebuilt the same index list three times per trace (x, y, z);
## one boolean mask per assigned label is equivalent and cheaper.
fig_features = []
for i in list(set(new_labels)):
    idx = new_labels == i
    fig_features.append(go.Scatter3d(
        x=global_data[idx, 0],
        y=global_data[idx, 1],
        z=global_data[idx, 2],
        mode="markers",
        marker={
            'line': {'width': 0},
            'size': 4,
            'symbol': 'circle',
            "opacity": 1
        },
        name=names[i]
    ))
layout = go.Layout(
    margin=dict(l=0, r=0, b=0, t=0)
)
fig = go.Figure(data=fig_features, layout=layout)
iplot(fig)