## Calculate Fst and other popgen statistics¶

In [1]:
import ipyrad.analysis as ipa
import numpy as np
import pandas as pd
import itertools

In [2]:
# get a sequence alignment from the HDF5

In [3]:
database = "/home/deren/Downloads/spp30.seqs.hdf5"
idx=0
wmin=0
wmax=100000

In [ ]:
parse_phystring

In [12]:
# init a popgen analysis object
pop = ipa.popgen(
name="ped",
workdir="analysis-popgen",
data="./pedicularis/clust85_min12_outfiles/clust85_min12.loci",
)

In [11]:
pop._fst()

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-11-0dc99510b3b6> in <module>()
----> 1 pop._fst()

114         )
115         d = self.npops
--> 116
117         # iterate over pairs of pops and fill Fst values
118         pairs = itertools.combinations(self.popdict.key(), 2)

AttributeError: 'dict' object has no attribute 'key'

## Update loci_to_arr()¶

... Now I'm thinking maybe I really should make a HDF5 format as the default for downstream analyses..., it can have SNPs, mapfile, and whole sequences with locimap.

In [ ]:
def _loci_to_arr(self):
"""
return a frequency array from a loci file for all loci with taxa from
taxdict and min coverage from mindict.
"""

#

## make the array (4 or 5) and a mask array to remove loci without cov
nloci = len(loci)
keep = np.zeros(nloci, dtype=np.bool_)
arr = np.zeros((nloci, 4, 300), dtype=np.float64)

## six rows b/c one for each p3, and for the fused p3 ancestor
if len(taxdict) == 5:
arr = np.zeros((nloci, 6, 300), dtype=np.float64)

## if not mindict, make one that requires 1 in each taxon
if isinstance(mindict, int):
mindict = {i: mindict for i in taxdict}
elif isinstance(mindict, dict):
mindict = {i: mindict[i] for i in taxdict}
else:
mindict = {i: 1 for i in taxdict}

## raise error if names are not 'p[int]'
allowed_names = ['p1', 'p2', 'p3', 'p4', 'p5']
if any([i not in allowed_names for i in taxdict]):
"keys in taxdict must be named 'p1' through 'p4' or 'p5'")

## parse key names
keys = sorted([i for i in taxdict.keys() if i[0] == 'p'])
outg = keys[-1]

## grab seqs just for the good guys
for loc in xrange(nloci):

## parse the locus
lines = loci[loc].split("\n")[:-1]
names = [i.split()[0] for i in lines]
seqs = np.array([list(i.split()[1]) for i in lines])

## check that names cover the taxdict (still need to check by site)
covs = [sum([j in names for j in taxdict[tax]]) >= mindict[tax] \
for tax in taxdict]

## keep locus
if all(covs):
keep[loc] = True

## get the refseq
refidx = np.where([i in taxdict[outg] for i in names])[0]
refseq = seqs[refidx].view(np.uint8)
ancestral = np.array([reftrick(refseq, GETCONS2)[:, 0]])

## freq of ref in outgroup
iseq = _reffreq2(ancestral, refseq, GETCONS2)
arr[loc, -1, :iseq.shape[1]] = iseq

## enter 4-taxon freqs
if len(taxdict) == 4:
for tidx, key in enumerate(keys[:-1]):

## get idx of names in test tax
nidx = np.where([i in taxdict[key] for i in names])[0]
sidx = seqs[nidx].view(np.uint8)

## get freq of sidx
iseq = _reffreq2(ancestral, sidx, GETCONS2)

## fill it in
arr[loc, tidx, :iseq.shape[1]] = iseq

else:

## entere p5; and fill it in
iseq = _reffreq2(ancestral, refseq, GETCONS2)
arr[loc, -1, :iseq.shape[1]] = iseq

## enter p1
nidx = np.where([i in taxdict['p1'] for i in names])[0]
sidx = seqs[nidx].view(np.uint8)
iseq = _reffreq2(ancestral, sidx, GETCONS2)
arr[loc, 0, :iseq.shape[1]] = iseq

## enter p2
nidx = np.where([i in taxdict['p2'] for i in names])[0]
sidx = seqs[nidx].view(np.uint8)
iseq = _reffreq2(ancestral, sidx, GETCONS2)
arr[loc, 1, :iseq.shape[1]] = iseq

nidx = np.where([i in taxdict['p3'] for i in names])[0]
nidy = np.where([i in taxdict['p4'] for i in names])[0]
sidx = seqs[nidx].view(np.uint8)
sidy = seqs[nidy].view(np.uint8)
xseq = _reffreq2(ancestral, sidx, GETCONS2)
yseq = _reffreq2(ancestral, sidy, GETCONS2)
arr[loc, 2, :xseq.shape[1]] = xseq
arr[loc, 3, :yseq.shape[1]] = yseq

## enter p34
nidx = nidx.tolist() + nidy.tolist()
sidx = seqs[nidx].view(np.uint8)
iseq = _reffreq2(ancestral, sidx, GETCONS2)
arr[loc, 4, :iseq.shape[1]] = iseq

## size-down array to the number of loci that have taxa for the test
arr = arr[keep, :, :]

## size-down sites to

return arr, keep

In [ ]:



### Calculations¶

Fst can be calculated per SNP, or averaged over sliding windows of SNPs of some size if reference information is present. Or over loci if

In [309]:
pop.results

Out[309]:
fst     Empty DataFrame
Columns: []
Index: []
pi      Empty DataFrame
Columns: [0]
Index: []
theta   Empty DataFrame
Columns: [0]
Index: []
In [527]:
popdict = {'i': [2, 3], 'j': [2, 4]}
mindict = {}
(mindict if mindict else {j:1 for j in popdict})

Out[527]:
{'i': 1, 'j': 1}
In [528]:
testdata1 = np.array([
[0, 0, 0, 0, 0],
[1, 0, 0, 0, 0],
[1, 1, 1, 0, 0],
[1, 1, 1, 1, 0],
[1, 1, 1, 1, 0],
[1, 1, 1, 1, 1],
])

testdata2 = np.random.binomial(1, 0.01, (6, 100))

In [532]:
class Self:
def __init__(self, data, popdict={}, mindict={}, mapfile=False):
self.data = data
self.popdict = popdict
self.mindict = mindict
self._parsedata()
self._checkdicts()
self._filterdata()

def _parsedata(self):
"""
parse data as an ndarray, vcfile, or locifile and store
in ... arrays, one with all sites, and mapfile...
"""
pass

def _checkdicts(self):
#assert len(popdict) > 2, "must enter at least two populations"
self.mindict = (mindict if mindict else {j:1 for j in popdict})

def _filterdata(self):
pass

def fst_H(self, pop1idx, pop2idx):
ii = list(itertools.combinations(pop1idx, 2))
jj = list(itertools.combinations(pop2idx, 2))
kk = itertools.combinations(pop1idx + pop2idx, 2)
between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk)

diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]
sums = np.sum(diff, axis=0)
a = sums / sums.shape[0]

diff = [self.data[i] != self.data[j] for (i, j) in between]
sums = np.sum(diff, axis=0)
b = sums / sums.shape[0]
return abs(1 - (a / b))

def theta_W(self):
"return Watternson's theta for each population"
df = pd.DataFrame(
data=np.zeros(len(self.popdict)),
columns=["theta"],
index=list(self.popdict.keys())
)
for row in self.popdict:
idat = self.data[self.popdict[row], :]
segregating = np.invert(np.all(idat == idat[0], axis=0)).sum()
denom = np.sum((1 / i for i in range(1, idat.shape[1] - 1)))
theta = segregating / denom
df.loc[row] = theta
return df

def pi(self, pop1idx, pop2idx):
pass

In [533]:
popdict = {'A': [0, 1], 'B': [2, 3, 4, 5]}
mindict = {'A': 1, 'B': 2}
self = Self(testdata2, popdict, mindict)

In [535]:
self.theta_W()

Out[535]:
theta
A 0.387
B 0.581
In [537]:
print(self.data)
print("\nFst per SNP:", self.fst_H([0, 1], [2, 3, 4, 5]))
print("\nFst mean:", np.mean(self.fst_H([0, 1], [2, 3, 4, 5])))
print("\ntheta per individual:", self.theta_W([0, 1], [2, 3, 4, 5]))
print("\ntheta per population:", self.theta_W([0, 1], [2, 3, 4, 5]))

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]

Fst per SNP: [ nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
nan  nan 0.75  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
nan  nan  nan  nan  nan  nan  nan  nan  nan 0.5   nan  nan  nan  nan
nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan
nan  nan  nan  nan 0.5   nan  nan  nan  nan  nan  nan  nan  nan  nan
nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan 0.75
nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan 0.5
nan  nan]

Fst mean: nan

/home/deren/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:40: RuntimeWarning: invalid value encountered in true_divide

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-537-e9d9e53cd653> in <module>()
2 print("\nFst per SNP:", self.fst_H([0, 1], [2, 3, 4, 5]))
3 print("\nFst mean:", np.mean(self.fst_H([0, 1], [2, 3, 4, 5])))
----> 4 print("\ntheta per individual:", self.theta_W([0, 1], [2, 3, 4, 5]))
5 print("\ntheta per population:", self.theta_W([0, 1], [2, 3, 4, 5]))

TypeError: theta_W() takes 1 positional argument but 3 were given
In [555]:
pd.DataFrame(
data=np.zeros((2, 2)),
index=['aaaaaa', 'bbbbbbbbbbb'],
)

Out[555]:
0 1
aaaaaa 0.0 0.0
bbbbbbbbbbb 0.0 0.0
In [242]:
self.fst([0, 1], [2, 3, 4, 5])

Out[242]:
(array([0.2, 0. , 0. , 0.6, 0.6]), array([0.2, 0. , 0. , 0.6, 0.6]))
In [235]:
pop1idx = [0, 1]
pop2idx = [2, 3, 4, 5]
pop1 = self.data[pop1idx, :]
pop2 = self.data[pop2idx, :]
popa = self.data[pop1idx + pop2idx, :]

In [74]:
ilist = itertools.combinations(range(4), 2)
list(ilist)

Out[74]:
[(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
In [75]:
ilist = itertools.combinations(range(pop2.shape[0]), 2)
list(ilist)

Out[75]:
[(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
In [80]:
def diffs(pop):
ilist = itertools.combinations(range(pop.shape[0]), 2)
diff = [pop[i] == pop[j] for (i, j) in ilist]
sums = np.sum(diff, axis=0)
return sums/sums.shape[0]

In [231]:
def fst(pop1idx, pop2idx):
ii = list(itertools.combinations(pop1idx, 2))
jj = list(itertools.combinations(pop2idx, 2))
kk = itertools.combinations(pop1idx + pop2idx, 2)
between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk)

diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]
sums = np.sum(diff, axis=0)
a = sums / sums.shape[0]

diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]
sums = np.sum(diff, axis=0)
b = sums / sums.shape[0]

return abs(1 - (a / b))

In [175]:
#fst([0, 1], [2, 3, 4, 5])

In [245]:
ii = list(itertools.combinations(pop1idx, 2))
jj = list(itertools.combinations(pop2idx, 2))
within = ii + jj
kk = itertools.combinations(pop1idx + pop2idx, 2)
between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk)

In [246]:
ii + jj

Out[246]:
[(0, 1), (2, 3), (2, 4), (2, 5), (3, 4), (3, 5), (4, 5)]
In [247]:
between

Out[247]:
<itertools.filterfalse at 0x7faa876c4eb8>
In [244]:
self.data

Out[244]:
array([[0, 0, 0, 0, 0],
[1, 0, 0, 0, 0],
[1, 1, 1, 0, 0],
[1, 1, 1, 1, 0],
[1, 1, 1, 1, 0],
[1, 1, 1, 1, 1]])
In [209]:
diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]
sums = np.sum(diff, axis=0)
a = sums / sums.shape[0]

a

Out[209]:
array([0.2, 0. , 0. , 0.6, 0.6])
In [243]:
abs(1 - (a / b))

Out[243]:
array([0.75, 1.  , 1.  , 0.5 , 0.5 ])
In [210]:
diff = [self.data[i] != self.data[j] for (i, j) in between]
sums = np.sum(diff, axis=0)
b = sums / sums.shape[0]

In [195]:
1 - (a / b)

Out[195]:
array([ 0.75,  1.  ,  1.  ,  0.5 , -0.5 ])
In [196]:
self.data

Out[196]:
array([[0, 0, 0, 0, 0],
[1, 0, 0, 0, 0],
[1, 1, 1, 0, 0],
[1, 1, 1, 1, 0],
[1, 1, 1, 1, 0],
[1, 1, 1, 1, 1]])
In [171]:
between

Out[171]:
[(0, 2), (0, 3), (0, 4), (0, 5), (1, 2), (1, 3), (1, 4), (1, 5)]
In [79]:
diffs(pop2) / diffs(popa)

Out[79]:
array([0.6       , 0.85714286, 0.85714286, 0.5       , 0.3       ])
In [14]:
for i in range(pop1.shape[0]):
for j in range(pop1.shape[0]):
print(i,j)

0 0
0 1
1 0
1 1

In [123]:
def _fstH(self, pop1idx, pop2idx):
"Hudson's Fst estimator"
pop1 = self.data[pop1idx, :]
pop2 = self.data[pop2idx, :]
pop1freq = np.mean(pop1, axis=0)
pop2freq = np.mean(pop2, axis=0)
a = (pop1freq - pop2freq) ** 2
b = ((pop1freq * (1.0 - pop1freq))) / (pop1.shape[0] - 1)
c = ((pop2freq * (1.0 - pop2freq))) / (pop2.shape[0] - 1)
return pd.Series(a - b - c).round(5)

In [141]:
def _fstWC(self, pop1idx, pop2idx):
"Wier and Cockerham (1984) Fst estimator"
pop1 = self.data[pop1idx, :]
pop2 = self.data[pop2idx, :]
popa = self.data[pop1idx + pop2idx, :]
pop1freq = np.mean(pop1, axis=0)
pop2freq = np.mean(pop2, axis=0)
popafreq = np.mean(popa, axis=0)
nbar = pop1.sum(axis=0) / 2. + pop2.sum(axis=0) / 2.
s2a = (
(1. / ((popa.shape[0] - 1) * nbar))
* ((pop1.shape[0] * 1))
)
nc = (
(1. / (npops - 1.))
* (popa.shape[0]
- ((pop1.shape[0] ** 2 + pop2.shape[0] ** 2) / popa.shape[0])
)
)
#t1 =
#t2 =
return t1, t2

In [ ]:
def fstWC(self, pop1idx, pop2idx):
pop1 = self.data[pop1idx, :]
pop2 = self.data[pop2idx, :]
popa = self.data[pop1idx + pop2idx, :]

pop1freq = np.mean(pop1, axis=0)
pop2freq = np.mean(pop2, axis=0)
popafreq = np.mean(popa, axis=0)

# frequecy of allele A in the sample of size ni from population i
p = ...
# observed proportion of individuals heterozygous for allele A
hbar =
# the average sample size
nbar =
nc =

In [138]:
pop1.sum(axis=0) / 2. + pop2.sum(axis=0) / 2.

Out[138]:
array([2. , 1.5, 1.5, 1. , 0.5])
In [140]:
pop1.shape[0] / 2. + pop2.shape[0] / 2.

Out[140]:
2.5
In [ ]:


npops= 2.0
nsamples = float(NpopA + NpopB)
n_bar= (NpopA / npops) + (NpopB / npops)
samplefreq = ( (popAcount+popBcount) / (NpopA + NpopB) )
pop1freq = popAcount / float(NpopA )
pop2freq = popBcount / float(NpopB )
Npop1 = NpopA
Npop2 = NpopB
S2A= (1/ ( (npops-1.0) * n_bar) ) *
( ( (Npop1)* ((pop1freq-samplefreq)**2) ) +
( (Npop2)*((pop2freq-samplefreq)**2) ) )
nc = 1.0/(npops-1.0) * ( (Npop1+Npop2) - (((Npop1**2)+(Npop2**2)) / (Npop1+Npop2)) )
T_1 = S2A -( ( 1/(n_bar-1) ) * ( (samplefreq * (1-samplefreq)) -  ((npops-1)/npops)* S2A ) )
T_2 = (( (nc-1) / (n_bar-1) ) * samplefreq *(1-samplefreq) )   +  (1.0 +   (((npops-1)*(n_bar-nc))  / (n_bar-1)))       * (S2A/npops)

In [119]:
pop1idx = [0, 1]
pop2idx = [2, 3, 4]
_fst(self, pop1idx, pop2idx)

Out[119]:
0    0.000
1    1.000
2    1.000
3    0.333
4   -0.000
dtype: float64
In [120]:
pop1idx = [0, 4]
pop2idx = [1, 2, 3]
_fst(self, pop1idx, pop2idx)

Out[120]:
0    0.000
1   -0.333
2   -0.333
3   -0.333
4    0.000
dtype: float64
In [22]:
pop1 = self.data[pop1idx, :]
pop2 = self.data[pop2idx, :]
popa = self.data[pop1idx + pop2idx, :]

print(pop1)
print(pop2)

[[0 0 0 0 0]
[1 0 0 0 0]]
[[1 1 1 0 0]
[1 1 1 1 0]
[1 1 1 1 1]]

In [104]:
pop1freq = np.mean(pop1, axis=0)
pop2freq = np.mean(pop2, axis=0)
popafreq = np.mean(popa, axis=0)

In [105]:
a = (pop1freq - pop2freq) ** 2
a

Out[105]:
array([0.25      , 1.        , 1.        , 0.44444444, 0.11111111])
In [106]:
b = ((pop1freq * (1.0 - pop1freq))) / (pop1.shape[0] - 1)
b

Out[106]:
array([0.25, 0.  , 0.  , 0.  , 0.  ])
In [107]:
c = ((pop2freq * (1.0 - pop2freq))) / (pop2.shape[0] - 1)
c

Out[107]:
array([0.        , 0.        , 0.        , 0.11111111, 0.11111111])
In [108]:
pd.Series(a - b - c).round(3)

Out[108]:
0    0.000
1    1.000
2    1.000
3    0.333
4   -0.000
dtype: float64
In [98]:
pd.Series(a - b - c).round(3)

Out[98]:
0    0.000
1    1.000
2    1.000
3    0.333
4   -0.000
dtype: float64
In [114]:
d = (pop1freq * (1.0 - pop2freq)) + (pop1freq * (1.0 - pop2freq))
d

Out[114]:
array([0., 0., 0., 0., 0.])
In [151]:
(np.mean(popa, axis=0) - np.mean(pop2, axis=0)) / (np.mean(popa, axis=0))

Out[151]:
array([-0.25      , -0.66666667, -0.66666667, -0.66666667, -0.66666667])
In [152]:
popa

Out[152]:
array([[0, 0, 0, 0, 0],
[1, 0, 0, 0, 0],
[1, 1, 1, 0, 0],
[1, 1, 1, 1, 0],
[1, 1, 1, 1, 1]])
In [153]:
np.count_nonzero(pop2, axis=0)

Out[153]:
array([3, 3, 3, 2, 1])
In [169]:
a = np.mean(popa, axis=0)
a[a > 0.5] = abs(a[a > 0.5] - 1)
a

Out[169]:
array([0.2, 0.4, 0.4, 0.4, 0.2])
In [172]:
b = np.mean(pop2, axis=0)

array([0.        , 0.        , 0.        , 0.33333333, 0.33333333])
(a - b) / a

array([ 1.        ,  1.        ,  1.        ,  0.16666667, -0.66666667])