import scipy
import numpy as np
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import estimate_bandwidth
import pandas as pd
from scipy import stats
from scipy.stats import beta
import matplotlib.pyplot as plt
import itertools as it
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
In a previous post, we went through the motions of classifying unlabelled haplotypes given representative samples of different populations.
However, we were dealing with a fairly clean scenario: one where all of our unlabelled haplotypes fell neatly within our reference distributions.
But, what if that was not the case? How do we identify novel genetic material that might be carried by our unlabelled accessions?
Here we shall begin to explore the added uses of Kernel Density Estimation, which I had mentioned before.
Our first step will be to simulate our populations and then reduce the dimensionality of our data.
## Let's go ahead and define the density function of our allele frequencies.
# :: Refer to previous posts if you wish to visualize this density under the beta distribution ::
# Beta shape parameters used to draw per-locus allele frequencies.
a, b = 1.5, .2
# number of populations to generate (3 references + 1 "ghost" population)
N_pops= 4
# length of haplotypes (number of loci)
L= 200
# Size of each reference population (the 4th population contributes no reference samples)
Sizes= [250,100,300]
labels= [0,1,2]
# number of unlabelled individuals to draw from each population:
n_unlab= {
0: 5,
1: 3,
2: 7,
3: 20
}
# population labels, one entry per reference haplotype
label_vector= np.repeat(np.array(labels),Sizes)
## save the probability vectors that will characterize each population:
prob_vectors= np.array([beta.rvs(a, b, size=L) for x in range(N_pops)])

def _draw_haplotype(probs):
    """Draw one haplotype: locus x carries allele 1 with probability 1 - probs[x].

    Vectorized Bernoulli draw — distributionally identical to the original
    per-locus np.random.choice([1, 0], p=[1 - probs[x], probs[x]]) loop,
    but a single C-level call per haplotype instead of L Python calls.
    """
    return (np.random.random(len(probs)) < (1. - probs)).astype(int).tolist()

data= []
for k in range(len(labels)):
    probs= prob_vectors[k,:]
    # Safety clamp: beta.rvs already lies in [0, 1], so this is a no-op guard.
    # (Note: probs is a view, so this would clamp prob_vectors in place.)
    probs[(probs > 1)]= 1
    data.extend([_draw_haplotype(probs) for acc in range(Sizes[k])])
data= np.array(data)
## create incognita haplotypes from both our known distributions as well as a fourth, uncharacterized population.
admixed= {k: [_draw_haplotype(prob_vectors[k,:]) for acc in range(n_unlab[k])] for k in n_unlab.keys()}
# Principal component analysis of the data generated.
## Number of components to retain:
n_comp = 3
## Perform the PCA on the whole data set so as not to lose variation absent from our references populations.
unlabelled_rows = [hap for hap in it.chain(*admixed.values())]
pca = PCA(n_components=n_comp, whiten=False, svd_solver='randomized')
pca.fit(np.vstack((data, unlabelled_rows)))
## Project only the reference haplotypes into feature space.
features = pca.transform(data)
print("Variance explained:")
variance_report = "; ".join('PC{0}: {1}'.format(idx + 1, round(pca.explained_variance_ratio_[idx], 3)) for idx in range(n_comp))
print(variance_report)
Variance explained: PC1: 0.196; PC2: 0.104; PC3: 0.021
Let's look at what we've got.
## Our reference populations: one 3D trace per labelled population,
## selected with a boolean mask over the label vector.
fig_data= []
for pop in labels:
    pop_mask= label_vector == pop
    fig_data.append(go.Scatter3d(
        x= features[pop_mask, 0],
        y= features[pop_mask, 1],
        z= features[pop_mask, 2],
        mode= "markers",
        marker= {
            'line': {'width': 1},
            'size': 4,
            'symbol': 'circle',
            "opacity": 1
        },
        name= str(pop)
    ))
## The incognita: project the unlabelled haplotypes and add them as a single black trace.
admx_t= pca.transform(np.array([hap for hap in it.chain(*admixed.values())]))
fig_data.append(
    go.Scatter3d(
        x= admx_t[:, 0],
        y= admx_t[:, 1],
        z= admx_t[:, 2],
        mode= "markers",
        marker= {
            'color': 'rgb(0,0,0)',
            'line': {'width': 1},
            'size': 4,
            'symbol': 'circle',
            "opacity": 1
        },
        name= 'unlabelled'
    ))
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=fig_data, layout=layout)
iplot(fig)
In feature space, accessions drawn from our 'ghost' population stand out. How do we avoid classing them into our known populations?
We had previously used maximum normalized log-likelihoods to assign our haplotypes. KDE extracted log-likelihoods represent the density, in feature space, of a reference distribution at the selected coordinates.
By normalizing log-likelihoods across the dataset by those of the references only, we obtain an estimate of how different the density at a given coordinate is from the mean density across haplotypes pertaining to that reference population.
This normalization is possible in part because the kernels used to construct our KDEs are Gaussian themselves. It also works as a way of circumventing the sample-size dependency of KDE-derived likelihoods.
All of this considered, the method is quite straightforward: A lower threshold is set below which haplotypes are flagged as outliers.
## setting our lower limit: normalized likelihoods below this are flagged as outliers.
Outlier_threshold= 1e-4
## stacking our data: unlabelled accessions first, then the reference features.
global_data= np.vstack((admx_t,features))
## calculating kernel bandwidth. A proxy of local differentiation allowed.
# BUGFIX: the bandwidth grid must be strictly positive. PCA scores are centred,
# so np.min(features) is negative and np.linspace(min, max, 20) would feed
# invalid (<= 0) bandwidths to KernelDensity. Use a positive grid spanning the
# same overall range of the feature space instead.
feature_span= np.max(features) - np.min(features)
params = {'bandwidth': np.linspace(feature_span / 100., feature_span, 20)}
grid = GridSearchCV(KernelDensity(algorithm = "ball_tree",breadth_first = False), params,verbose=0)
Scores= []
for lab in labels:
    ## haplotypes belonging to the current reference population.
    Quanted_set= features[label_vector == lab, :]
    ## cross-validated bandwidth selection on this reference only.
    grid.fit(Quanted_set)
    kde = grid.best_estimator_
    ## log-densities of the reference KDE at its own members...
    P_dist = kde.score_samples(Quanted_set)
    ## ...and at every sample in the combined data set.
    Fist = kde.score_samples(global_data)
    ## Normalizing log-likelihood estimates by those of the reference set.
    Fist = scipy.stats.norm(np.mean(P_dist),np.std(P_dist)).cdf(Fist)
    Scores.append(Fist)
## rows = samples, columns = reference populations.
Scores= np.array(Scores).T
new_labels= np.argmax(Scores,axis= 1)
## flag as outliers (-1) samples whose best normalized score is below threshold.
# PERF: np.amax was previously recomputed inside the comprehension for every
# sample; compute it once and mask.
max_scores= np.amax(Scores,axis= 1)
below_threshold= np.where(max_scores < Outlier_threshold)[0]
new_labels[below_threshold]= -1
print('First twenty assignments:')
print(new_labels[:20])
print(', '.join(['label {0}: {1}'.format(x,new_labels.tolist().count(x)) for x in list(set(new_labels))]))
First twenty assignments: [ 0 0 0 0 0 1 1 1 2 2 2 2 2 2 2 -1 -1 -1 -1 -1] label 0: 255, label 1: 103, label 2: 307, label -1: 20
names= ["0","1","2","outlier"]
## One trace per assigned label. Note the indexing trick: the outlier label is
## -1, so names[lab] wraps around to the last entry, "outlier".
fig_features= []
for lab in list(set(new_labels)):
    assigned= new_labels == lab
    fig_features.append(go.Scatter3d(
        x= global_data[assigned, 0],
        y= global_data[assigned, 1],
        z= global_data[assigned, 2],
        mode= "markers",
        marker= {
            'line': {'width': 1},
            'size': 4,
            'symbol': 'circle',
            "opacity": 1
        },
        name= names[lab]
    ))
layout = go.Layout(margin=dict(l=0, r=0, b=0, t=0))
fig = go.Figure(data=fig_features, layout=layout)
iplot(fig)
Our final classification. We have successfully identified material from a foreign population, while at the same time correctly assigning other haplotypes to populations known to us (the bearers of labels).
However, we can also observe that some haplotypes were wrongly labelled as outliers. This could be solved by lowering the outlier threshold, but that entails its own risks, namely in regions where the outlier population is not as differentiated. In my opinion some degree of error will always exist. Classification should be treated as an exploratory analysis, its errors to be identified and treated downstream - see post on cluster profiles.
Next up, Mis-labelling! Introgressions and what to do when the foreign material has seeped into our reference sets.