In [15]:

# If installed from pip, import lostruct as ls will work
import lostruct.lostruct as ls

# PCoA from skbio.stats is the best implementation of R's MDS algorithm
from skbio.stats.ordination import pcoa

# Much of the output from CyVCF2 and lostruct are numpy arrays
import numpy as np

import pandas as pd
import plotly.express as px
from sklearn.manifold import MDS
import umap
import hdbscan
import plotly.io as pio
pio.renderers.default = "notebook_connected"

In [16]:

# Two VCF utility functions are proivded. get_samples() and get_landmarks()

# This will be the same order of the resulting data
samples = ls.get_samples("test_data/chr1-filtered.vcf.gz")
samples[0:5]

Out[16]:

['HM017-I', 'HM018', 'HM022-I', 'HM029', 'HM030']

In [17]:

# Utility function: Get list of landmarks (chromosome, scaffolds, contigs, etc..)
landmarks = ls.get_landmarks("test_data/chr1-filtered.vcf.gz")
landmarks[0:5]

Out[17]:

['chl_Mt', 'chr1', 'chr2', 'chr3', 'chr4']

In [18]:

# Docstrings are also provided
help(ls.get_samples)

Help on function get_samples in module lostruct.lostruct:

get_samples(vcf_file)
    Get the samples from a VCF/BCF file. This is the order the data will remain in as well.

In [19]:

# Parse VCF to get windows and positions of each SNP within each window
windows, positions = ls.parse_vcf("test_data/chr1-filtered.vcf.gz", "chr1", 95, ls.Window.SNP)
# ls.Window.SNP specifies window sizes are by SNP count. ls.Window.BP specifies windows are in base pair lengths.

# *** ls.Window.BP is not yet implemented, however. ***
# Please see: https://github.com/jguhlin/lostruct-py/issues/8

# Accumulate output of eigen_windows
result = list()
for x in windows:
    result.append(ls.eigen_windows(x, 10, 1))

# Convert to numpy array
result = np.vstack(result)

# Get PCA distances comparison matrix
pc_dists = ls.get_pc_dists(result)
# An additional mode, fastmath, is available. Trading some accuracy for a slight speed boost (~8%)
pc_dists = ls.get_pc_dists(result, fastmath=True)

# Get PCoA value of pc_dists matrix (this is equivalent to R's MDS)
# PLEASE NOTE: See section below: Working with Large Datasets
# For recommended ways to run pcoa
mds = pcoa(pc_dists)

/home/josephguhlin/anaconda3/envs/lostruct/lib/python3.8/site-packages/numpy/core/_asarray.py:136: VisibleDeprecationWarning:

Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray

In [20]:

px.scatter(y=mds.samples["PC1"], title="MDS Coordinate 1 (y-axis) compared to Window (x-axis)")

In [21]:

px.scatter(y=mds.samples["PC2"], title="MDS Coordinate 2 (y-axis) compared to Window (x-axis)")

In [22]:

px.scatter(x=mds.samples["PC1"], y=mds.samples["PC2"], title="MDS Coordinate 1 (x-axis) and MDS Coordinate 2 (y-axis)")

Performing Analaysis Genome-Wide¶

In [ ]:

landmarks = ls.get_landmarks("test_data/complete_file.vcf.gz")

results = list()
snp_positions = list()

for landmark in landmarks:
    windows, positions = ls.parse_vcf("test_data/complete_file.vcf.gz", landmark, 95)
    for i, window in enumerate(windows):
        results.append(ls.eigen_windows(window, 10, 1))
        snp_positions.append(positions[i])

While the above will not work due to a missing file, it is the appropriate way to get the results for each window for all landmarks (chromosomes, scaffolds, contigs, etc...). Here, we keep track of snp_positions as well, and len(snp_positions) == len(results) so they can be further investigated.

The code will then remain the same:

In [ ]:

# Convert to numpy array
results = np.vstack(results)

# Get PCA distances comparison matrix
pc_dists = ls.get_pc_dists(results)

# Get PCoA value of pc_dists matrix (this is equivalent to R's MDS)
mds = pcoa(pc_dists)

Comparison to R Version¶

In [23]:

mds_coords = pd.read_csv("lostruct-results/mds_coords.csv")
np.corrcoef(mds.samples['PC1'], mds_coords['MDS1'].to_numpy())[0][1]
# R-value is:

Out[23]:

0.9971509982243155

In [24]:

px.scatter(x=mds.samples["PC1"], y=mds_coords['MDS1'])

Working with Large Datasets¶

In [25]:

# PCOA for reduced memory consumption and faster clustering
mds = pcoa(pc_dists, method="fsvd", inplace=True, number_of_dimensions=10)
np.corrcoef(mds.samples["PC1"], mds_coords['MDS1'].to_numpy())[0][1]

Out[25]:

-0.9972686515756828

In [26]:

px.scatter(y=[mds.samples["PC1"], mds_coords['MDS1']], title="")

In [27]:

px.scatter(x=mds.samples["PC1"], y=mds_coords['MDS1'])

Some looks at other methods of clustering / comparing¶

In [28]:

embedding = MDS(n_components=10, dissimilarity="precomputed", n_jobs=-1, n_init=32)
mds = embedding.fit_transform(pc_dists)
px.scatter(y=[mds[:,0], mds_coords['MDS1']], title="Blue is using Python MDS, Red is PCoA method")

In [29]:

import phate
phater = phate.PHATE(n_components=10, knn_dist='precomputed', mds_solver='smacof', mds='metric')
comparison_phate = phater.fit_transform(pc_dists)

Calculating PHATE...
  Running PHATE on precomputed distance matrix with 124 observations.
  Calculating graph and diffusion operator...
    Calculating affinities...
    Calculated affinities in 0.03 seconds.
  Calculated graph and diffusion operator in 0.03 seconds.
  Calculating optimal t...
    Automatically selected t = 12
  Calculated optimal t in 0.08 seconds.
  Calculating diffusion potential...
  Calculated diffusion potential in 0.15 seconds.
  Calculating metric MDS...
  Calculated metric MDS in 3.48 seconds.
Calculated PHATE in 3.77 seconds.

In [30]:

mds = pcoa(pc_dists)
px.scatter(y=[mds_coords['MDS1'], mds.samples["PC1"], comparison_phate[:,0]], title="Green is PHATE")
# https://github.com/KrishnaswamyLab/PHATE
# Moon, van Dijk, Wang, Gigante et al. Visualizing Transitions and Structure for Biological Data Exploration. 2019. Nature Biotechnology.

In [31]:

reducer = umap.UMAP()
embedding = reducer.fit_transform(pc_dists)
px.scatter(x=embedding[:, 0], y=embedding[:, 1])
# UMAP: https://umap-learn.readthedocs.io/en/latest/index.html
# McInnes, L, Healy, J, UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction, ArXiv e-prints 1802.03426, 2018

In [32]:

hdbscan_labels = hdbscan.HDBSCAN().fit_predict(embedding)
px.scatter(x=embedding[:, 0], y=embedding[:, 1], color=hdbscan_labels)
# hdbscan: https://hdbscan.readthedocs.io/en/latest/index.html

In [33]:

reducer = umap.UMAP(n_components=3)
embedding = reducer.fit_transform(pc_dists)
hdbscan_labels = hdbscan.HDBSCAN().fit_predict(embedding)
fig = px.scatter_3d(x=embedding[:, 0], y=embedding[:, 1], z=embedding[:, 2], color=hdbscan_labels, width=800, height=600)
fig.show()

The following is for future features, please disregard¶

Code used to generate the original random weights¶

If weights are urgently needed, please add a comment to the existing github issue

In [ ]:

# Code originally used to generate and save random weights. Saved and static now to feed into R and compare with lostruct R
weights = np.random.random_sample(len(samples))
weights = weights*10

outfh = open("test_data/random_weights.txt", "w")

outfh.write("{}\t{}\n".format("ID", "weight"))
for id,w in zip(samples, weights):
    outfh.write("{}\t{:.06f}\n".format(id, w))

del(outfh)
np.save("test_data/random_weights.npy", weights)
weights

In [ ]:

weights = np.load("test_data/random_weights.npy")
weights

In [ ]:

def get_pc_dists(windows, fastmath=False, w=1):
    """
    Calculate distances between window matrices.

    Works on only the upper triangle of the matrix, but clones the data into the lower half as well.
    """
    n = len(windows)

    vals = ls.l1_norm(np.asarray([x[2] for x in windows]))
    vals = vals.real.astype(np.float64)
    
    vecs = np.asarray([x[3] for x in windows])
    weights = w[:, np.newaxis]
    #sqrt_w = np.squeeze(np.repeat(np.sqrt(weights), result[0][3].shape[0], axis=1))
    sqrt_w = np.squeeze(np.sqrt(weights))
    print(sqrt_w.shape)
    vecs = np.multiply(vecs, sqrt_w)
    #vecs = np.multiply(vecs, sqrt_w.T)
    #print(vecs)

    if fastmath:
        comparison = ls.calc_dists_fastmath(n, vals, vecs)
    else:
        comparison = ls.calc_dists(n, vals, vecs)

    # Remove negatives... Can't be placed within Numba code
    comparison[comparison < 0] = 0

    # Get square root
    return np.sqrt(comparison)

In [ ]:

vecs.shape

In [ ]:

#result = list()
#for x in windows:
#    result.append(ls.eigen_windows(x, 10, weights))
#result = np.vstack(result)

pc_dists = get_pc_dists(result, fastmath=True, w=weights)
mds = pcoa(pc_dists)
mds_coords = pd.read_csv("lostruct-results/weights_mds_coords.csv")
print("Weights compared to Lostruct R:")
print(np.corrcoef(mds.samples['PC1'], mds_coords['MDS1'].to_numpy()))

In [ ]:

pc_dists

In [ ]:

def get_pc_dists(windows, fastmath=False, w=1):
    """
    Calculate distances between window matrices.

    Works on only the upper triangle of the matrix, but clones the data into the lower half as well.
    """
    n = len(windows)
    
    sqrt_w = np.sqrt(w)
    vals = np.multiply(x[2], sqrt_w.T)
    
    vals = ls.l1_norm(np.asarray([vals for x in windows]))
    vals = vals.real.astype(np.float64)

    if fastmath:
        comparison = calc_dists_fastmath(n, vals, np.asarray([x[3] for x in windows]))
    else:
        comparison = calc_dists(n, vals, np.asarray([x[3] for x in windows]))

    # Remove negatives... Can't be placed within Numba code
    comparison[comparison < 0] = 0

    # Get square root
    return np.sqrt(comparison)

In [ ]: