anhima.ped
- Pedigrees¶import numpy as np
np.random.seed(1)
import scipy.stats
import itertools
import random
random.seed(42)
import matplotlib.pyplot as plt
%matplotlib inline
import sys
# import anhima
# dev imports
sys.path.insert(0, '..')
%reload_ext autoreload
%autoreload 1
%aimport anhima.sim
%aimport anhima.util
%aimport anhima.gt
%aimport anhima.ped
# simulate parent diplotype
n_variants = 1000
p_alt = .5
parent_diplotype_truth = scipy.stats.bernoulli.rvs(p_alt, size=1000*2).reshape(1000, 2)
# introduce some missingness
p_missing = .03
n_missing = scipy.stats.binom(p=p_missing, n=n_variants).rvs()
loc_missing = random.sample(range(n_variants), n_missing)
parent_diplotype = parent_diplotype_truth.copy()
parent_diplotype[loc_missing] = (-1, -1)
parent_diplotype
array([[0, 1], [0, 0], [0, 0], ..., [1, 0], [0, 1], [0, 1]])
anhima.gt.count_called(parent_diplotype)
969
anhima.gt.count_hom_ref(parent_diplotype)
214
anhima.gt.count_het(parent_diplotype)
502
anhima.gt.count_hom_alt(parent_diplotype)
253
# simulate gamete haplotypes
n_gametes = 20
gamete_haplotypes = np.empty((n_variants, n_gametes), dtype='i1')
n_crossovers = scipy.stats.poisson.rvs(.8, size=n_gametes)
p_mendel_error = .03
for i in range(n_gametes):
# randomly choose which parent to start with
parent = scipy.stats.bernoulli(.5).rvs()
h = parent_diplotype_truth[:, parent].copy()
# simulate crossovers
loc_switches = sorted(np.random.randint(0, n_variants, size=n_crossovers[i]))
for l in loc_switches:
parent = 0 if parent == 1 else 1
h[l:] = parent_diplotype_truth[l:, parent]
# simulate errors
n_me = scipy.stats.binom(p=p_mendel_error, n=n_variants).rvs()
loc_me = random.sample(range(n_variants), n_me)
h[loc_me] = scipy.stats.bernoulli.rvs(.5, size=n_me)
# simulate missingness
n_missing = scipy.stats.binom(p=p_missing, n=n_variants).rvs()
loc_missing = random.sample(range(n_variants), n_missing)
h[loc_missing] = -1
gamete_haplotypes[:, i] = h
gamete_haplotypes
array([[ 0, 0, 0, ..., 1, 0, 1], [ 0, 0, 0, ..., 0, 0, 0], [ 0, 0, 0, ..., 0, 0, 0], ..., [ 1, 1, 1, ..., 1, 1, -1], [ 0, 0, 0, ..., 0, 0, 0], [ 0, 0, 0, ..., 0, 0, 0]], dtype=int8)
inh = anhima.ped.diploid_inheritance(parent_diplotype, gamete_haplotypes)
inh
array([[1, 1, 1, ..., 2, 1, 2], [3, 3, 3, ..., 3, 3, 3], [3, 3, 3, ..., 3, 3, 3], ..., [1, 1, 1, ..., 1, 1, 7], [1, 1, 1, ..., 1, 1, 1], [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)
inheritance_colors = ['red', # parent 1
'blue', # parent 2
'green', # parents both ref
'orange', # parents both alt
'black', # non-parental
'yellow', # parents missing
'white'] # missing
fig, ax = plt.subplots(figsize=(12, 8))
anhima.gt.plot_discrete_calldata(inh,
colors=inheritance_colors,
states=range(1, 8),
ax=ax);
inh_seg = inh[anhima.gt.is_het(parent_diplotype), :]
inh_seg.shape
(502, 20)
fig, ax = plt.subplots(figsize=(12, 8))
anhima.gt.plot_discrete_calldata(inh_seg,
colors=inheritance_colors,
states=range(8),
ax=ax)
ax.set_xticks(np.arange(0, inh_seg.shape[0], 50));
# simulate random variant positions
pos = np.asarray(random.sample(range(100000), inh_seg.shape[0]))
pos.sort()
pos_impute = [0] + random.sample(range(100000), 100) + [200000]
pos_impute = np.array(pos_impute)
pos_impute.sort()
pos_impute
array([ 0, 371, 1947, 2068, 2799, 3379, 4082, 4288, 5683, 6629, 6868, 7936, 8330, 8992, 12015, 12978, 13022, 14293, 14408, 14896, 16011, 16505, 16614, 17038, 17275, 18964, 19340, 20253, 22838, 22871, 24878, 26896, 27152, 27801, 28305, 28477, 29038, 30539, 30947, 31186, 31925, 32853, 34782, 36160, 36342, 36666, 39535, 42437, 43064, 43713, 46955, 47012, 47453, 47537, 48147, 48566, 52184, 53492, 54379, 54935, 55176, 58115, 58287, 58756, 59245, 59600, 60036, 62422, 64139, 66039, 67271, 68347, 68955, 70804, 70807, 71364, 72641, 74000, 76858, 76983, 77182, 77812, 78116, 78269, 80623, 80694, 83819, 84109, 84323, 84518, 85841, 86363, 86646, 86870, 89873, 90476, 95236, 95321, 97218, 97290, 97735, 200000])
inh_imputed = anhima.ped.impute_inheritance_nearest(inh_seg, pos, pos_impute)
fig, ax = plt.subplots(figsize=(12, 8))
anhima.gt.plot_discrete_calldata(inh_imputed,
colors=inheritance_colors,
states=range(8),
ax=ax);
# consider first gamete only
x = inh_seg[:, 0]
fig, ax = plt.subplots(figsize=(14, 1))
anhima.gt.plot_discrete_calldata(x[:, None],
colors=inheritance_colors,
states=range(8),
ax=ax)
ax.set_xticks(np.arange(0, x.size, 50));
states = anhima.ped.INHERIT_PARENT1, anhima.ped.INHERIT_PARENT2
df_switches = anhima.util.tabulate_state_transitions(x, states, pos)
df_switches.head()
lidx | ridx | lpos | rpos | lval | rval | |
---|---|---|---|---|---|---|
0 | 2 | 3 | 350 | 1061 | 1 | 2 |
1 | 3 | 4 | 1061 | 1231 | 2 | 1 |
2 | 56 | 57 | 12735 | 12739 | 1 | 2 |
3 | 72 | 73 | 15292 | 15396 | 2 | 1 |
4 | 73 | 74 | 15396 | 15525 | 1 | 2 |
df_switches.tail()
lidx | ridx | lpos | rpos | lval | rval | |
---|---|---|---|---|---|---|
19 | 409 | 410 | 82578 | 82727 | 2 | 1 |
20 | 429 | 430 | 86400 | 86499 | 1 | 2 |
21 | 430 | 431 | 86499 | 86563 | 2 | 1 |
22 | 474 | 475 | 94203 | 94493 | 1 | 2 |
23 | 475 | 476 | 94493 | 94692 | 2 | 1 |
df_blocks = anhima.util.tabulate_state_blocks(x, states, pos)
df_blocks.head()
start_min_idx | start_max_idx | stop_min_idx | stop_max_idx | start_min_pos | start_max_pos | stop_min_pos | stop_max_pos | state | support | size_min | size_max | length_min | length_max | is_marginal | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 2 | 3 | 150 | 150 | 350 | 1061 | 1 | 3 | 2 | 2 | 200 | 911 | True |
1 | 2 | 3 | 3 | 4 | 350 | 1061 | 1061 | 1231 | 2 | 1 | 1 | 1 | 0 | 881 | False |
2 | 3 | 4 | 56 | 57 | 1061 | 1231 | 12735 | 12739 | 1 | 52 | 53 | 53 | 11504 | 11678 | False |
3 | 56 | 57 | 72 | 73 | 12735 | 12739 | 15292 | 15396 | 2 | 16 | 16 | 16 | 2553 | 2661 | False |
4 | 72 | 73 | 73 | 74 | 15292 | 15396 | 15396 | 15525 | 1 | 1 | 1 | 1 | 0 | 233 | False |
df_blocks.tail()
start_min_idx | start_max_idx | stop_min_idx | stop_max_idx | start_min_pos | start_max_pos | stop_min_pos | stop_max_pos | state | support | size_min | size_max | length_min | length_max | is_marginal | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
20 | 409 | 410 | 429 | 430 | 82578 | 82727 | 86400 | 86499 | 1 | 19 | 20 | 20 | 3673 | 3921 | False |
21 | 429 | 430 | 430 | 431 | 86400 | 86499 | 86499 | 86563 | 2 | 1 | 1 | 1 | 0 | 163 | False |
22 | 430 | 431 | 474 | 475 | 86499 | 86563 | 94203 | 94493 | 1 | 42 | 44 | 44 | 7640 | 7994 | False |
23 | 474 | 475 | 475 | 476 | 94203 | 94493 | 94493 | 94692 | 2 | 1 | 1 | 1 | 0 | 489 | False |
24 | 475 | 476 | 501 | 501 | 94493 | 94692 | 99991 | 99991 | 1 | 26 | 26 | 26 | 5299 | 5498 | True |
df_all_switches = anhima.ped.tabulate_inheritance_switches(inh_seg, pos)
df_all_switches.head()
lidx | ridx | lpos | rpos | lval | rval | ||
---|---|---|---|---|---|---|---|
0 | 0 | 2 | 3 | 350 | 1061 | 1 | 2 |
1 | 3 | 4 | 1061 | 1231 | 2 | 1 | |
2 | 56 | 57 | 12735 | 12739 | 1 | 2 | |
3 | 72 | 73 | 15292 | 15396 | 2 | 1 | |
4 | 73 | 74 | 15396 | 15525 | 1 | 2 |
df_switches.equals(df_all_switches.loc[0])
True
df_all_blocks = anhima.ped.tabulate_inheritance_blocks(inh_seg, pos)
df_all_blocks.head()
start_min_idx | start_max_idx | stop_min_idx | stop_max_idx | start_min_pos | start_max_pos | stop_min_pos | stop_max_pos | state | support | size_min | size_max | length_min | length_max | is_marginal | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 2 | 3 | 150 | 150 | 350 | 1061 | 1 | 3 | 2 | 2 | 200 | 911 | True |
1 | 2 | 3 | 3 | 4 | 350 | 1061 | 1061 | 1231 | 2 | 1 | 1 | 1 | 0 | 881 | False | |
2 | 3 | 4 | 56 | 57 | 1061 | 1231 | 12735 | 12739 | 1 | 52 | 53 | 53 | 11504 | 11678 | False | |
3 | 56 | 57 | 72 | 73 | 12735 | 12739 | 15292 | 15396 | 2 | 16 | 16 | 16 | 2553 | 2661 | False | |
4 | 72 | 73 | 73 | 74 | 15292 | 15396 | 15396 | 15525 | 1 | 1 | 1 | 1 | 0 | 233 | False |
df_blocks.equals(df_all_blocks.loc[0])
True
block_support, block_length_min = anhima.ped.inheritance_block_masks(inh_seg, pos)
# filter using block support
inh_seg_flt = inh_seg.copy()
inh_seg_flt[block_support < 2] = anhima.ped.INHERIT_MISSING
fig, ax = plt.subplots(figsize=(12, 8))
anhima.gt.plot_discrete_calldata(inh_seg_flt,
colors=inheritance_colors,
states=range(8),
ax=ax)
ax.set_xticks(np.arange(0, inh_seg.shape[0], 50));
anhima.util.tabulate_state_blocks(inh_seg_flt[:, 0], states, pos)
start_min_idx | start_max_idx | stop_min_idx | stop_max_idx | start_min_pos | start_max_pos | stop_min_pos | stop_max_pos | state | support | size_min | size_max | length_min | length_max | is_marginal | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 56 | 57 | 150 | 150 | 12735 | 12739 | 1 | 55 | 56 | 56 | 12585 | 12589 | True |
1 | 56 | 57 | 409 | 410 | 12735 | 12739 | 82578 | 82727 | 2 | 333 | 353 | 353 | 69839 | 69992 | False |
2 | 409 | 410 | 501 | 501 | 82578 | 82727 | 99991 | 99991 | 1 | 86 | 92 | 92 | 17264 | 17413 | True |
# filter using minimal block length
inh_seg_flt = inh_seg.copy()
inh_seg_flt[block_length_min < 100] = anhima.ped.INHERIT_MISSING
fig, ax = plt.subplots(figsize=(12, 8))
anhima.gt.plot_discrete_calldata(inh_seg_flt,
colors=inheritance_colors,
states=range(8),
ax=ax)
ax.set_xticks(np.arange(0, inh_seg.shape[0], 50));
anhima.util.tabulate_state_blocks(inh_seg_flt[:, 0], states, pos)
start_min_idx | start_max_idx | stop_min_idx | stop_max_idx | start_min_pos | start_max_pos | stop_min_pos | stop_max_pos | state | support | size_min | size_max | length_min | length_max | is_marginal | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 56 | 57 | 150 | 150 | 12735 | 12739 | 1 | 55 | 56 | 56 | 12585 | 12589 | True |
1 | 56 | 57 | 406 | 410 | 12735 | 12739 | 82242 | 82727 | 2 | 331 | 350 | 353 | 69503 | 69992 | False |
2 | 406 | 410 | 501 | 501 | 82242 | 82727 | 99991 | 99991 | 1 | 86 | 92 | 95 | 17264 | 17749 | True |