import numpy as np
np.random.seed(42)
import random
random.seed(42)
import matplotlib.pyplot as plt
%matplotlib inline
import sys
# import anhima
# dev imports
sys.path.insert(0, '..')
%load_ext autoreload
%autoreload 1
%aimport anhima.sim
%aimport anhima.ld
%aimport anhima.loc
Simulate genotypes with some degree of linkage disequilibrium.
n_variants = 1000
n_samples = 100
correlation = .97
g = anhima.sim.simulate_genotypes_with_ld(n_variants, n_samples, correlation)
g
array([[2, 0, 2, ..., 2, 0, 0], [0, 2, 0, ..., 0, 2, 2], [0, 2, 0, ..., 0, 2, 2], ..., [0, 2, 1, ..., 0, 1, 2], [0, 2, 1, ..., 0, 1, 2], [0, 2, 1, ..., 0, 1, 2]], dtype=int8)
Simulate the genomic physical positions of the variants.
pos = np.random.randint(low=0, high=n_variants*100, size=n_variants)
pos.sort()
Calculate an approximate value of LD between all pairs of variants.
r_squared = anhima.ld.pairwise_genotype_ld(g)
r_squared
array([[ 1.00000000e+00, 9.07748779e-01, 8.18691153e-01, ..., 1.49118959e-04, 3.89208802e-04, 3.57366211e-04], [ 9.07748779e-01, 1.00000000e+00, 9.06034190e-01, ..., 1.64228306e-03, 5.33172702e-03, 4.96930122e-03], [ 8.18691153e-01, 9.06034190e-01, 1.00000000e+00, ..., 4.79109120e-03, 1.04433734e-02, 6.92715218e-03], ..., [ 1.49118959e-04, 1.64228306e-03, 4.79109120e-03, ..., 1.00000000e+00, 9.18831940e-01, 8.75647397e-01], [ 3.89208802e-04, 5.33172702e-03, 1.04433734e-02, ..., 9.18831940e-01, 1.00000000e+00, 9.52542895e-01], [ 3.57366211e-04, 4.96930122e-03, 6.92715218e-03, ..., 8.75647397e-01, 9.52542895e-01, 1.00000000e+00]])
anhima.ld.plot_pairwise_ld(r_squared);
# zoom in to first 100 variants
anhima.ld.plot_pairwise_ld(r_squared[:100, :100]);
cor, sep, dist = anhima.ld.pairwise_ld_decay(r_squared, pos)
cor, sep, dist
(array([ 0.90774878, 0.81869115, 0.79309105, ..., 0.91883194, 0.8756474 , 0.95254289]), array([1, 2, 3, ..., 1, 2, 1]), array([ 59, 245, 383, ..., 295, 437, 142]))
plt.hist(cor[sep == 1]);
plt.hist(cor[(dist > 0) & (dist < 200)]);
np.sqrt(np.mean(cor[sep == 1]))
0.97025821817633195
anhima.ld.plot_ld_decay_by_separation(cor, sep);
anhima.ld.plot_ld_decay_by_distance(cor, dist, bins=np.arange(0, 10000, 200));
anhima.ld.plot_windowed_ld(g, pos, window_size=10000);
anhima.ld.plot_windowed_ld(g, pos, window_size=1000);
included = anhima.ld.ld_prune_pairwise(g)
Indices of the variants that have been retained.
np.nonzero(included)[0]
array([ 0, 37, 80, 102, 125, 146, 172, 194, 219, 249, 283, 306, 333, 353, 372, 397, 426, 446, 475, 500, 525, 555, 580, 609, 634, 661, 697, 723, 754, 796, 815, 847, 872, 896, 913, 943, 987])
g_pruned = np.compress(included, g, axis=0)
g_pruned
array([[2, 0, 2, ..., 2, 0, 0], [0, 0, 0, ..., 1, 2, 0], [0, 1, 2, ..., 1, 1, 1], ..., [2, 1, 0, ..., 0, 2, 2], [2, 0, 1, ..., 1, 0, 1], [0, 0, 0, ..., 2, 1, 0]], dtype=int8)
r_squared_pruned = anhima.ld.pairwise_genotype_ld(g_pruned)
anhima.ld.plot_pairwise_ld(r_squared_pruned);
pos_pruned = pos[included]
cor_pruned, sep_pruned, dist_pruned = anhima.ld.pairwise_ld_decay(r_squared_pruned, pos_pruned)
anhima.ld.plot_ld_decay_by_separation(cor_pruned, sep_pruned);
anhima.ld.plot_ld_decay_by_distance(cor_pruned, dist_pruned, bins=np.arange(0, 10000, 400));
n_variants_big = 100000
g_big = anhima.sim.simulate_genotypes_with_ld(n_variants_big, n_samples, correlation=.97)
pos_big = np.random.randint(low=0, high=n_variants_big*100, size=n_variants_big)
pos_big.sort()
included_big = anhima.ld.ld_prune_pairwise(g_big)
np.count_nonzero(included_big)
3706
g_big_pruned = np.compress(included_big, g_big, axis=0)
r_squared_big_pruned = anhima.ld.pairwise_genotype_ld(g_big_pruned)
anhima.ld.plot_pairwise_ld(r_squared_big_pruned[:100, :100]);
cor_big, sep_big, dist_big = anhima.ld.windowed_ld_decay(g_big, pos_big, window_size=1000, step=10)
anhima.ld.plot_ld_decay_by_separation(cor_big, sep_big, max_separation=100);
anhima.ld.plot_ld_decay_by_distance(cor_big, dist_big, bins=np.arange(0, 10000, 200));
anhima.ld.plot_windowed_ld(g_big, pos_big, window_size=100000);
anhima.ld.plot_windowed_ld(g_big, pos_big, window_size=10000);