#!/usr/bin/env python # coding: utf-8 # ## Calculate Fst and other popgen statistics # In[1]: import ipyrad.analysis as ipa import numpy as np import pandas as pd import itertools # In[2]: # get a sequence alignment from the HDF5 # In[3]: database = "/home/deren/Downloads/spp30.seqs.hdf5" idx=0 wmin=0 wmax=100000 # In[ ]: parse_phystring # In[12]: # init a popgen analysis object pop = ipa.popgen( name="ped", workdir="analysis-popgen", data="./pedicularis/clust85_min12_outfiles/clust85_min12.loci", ) # In[11]: pop._fst() # ## Update `loci_to_arr()` # # ... Now I'm thinking maybe I really should make a HDF5 format as the default for downstream analyses..., it can have SNPs, mapfile, and whole sequences with locimap. # In[ ]: def _loci_to_arr(self): """ return a frequency array from a loci file for all loci with taxa from taxdict and min coverage from mindict. """ # loci = infile.read().strip().split("|\n") ## make the array (4 or 5) and a mask array to remove loci without cov nloci = len(loci) keep = np.zeros(nloci, dtype=np.bool_) arr = np.zeros((nloci, 4, 300), dtype=np.float64) ## six rows b/c one for each p3, and for the fused p3 ancestor if len(taxdict) == 5: arr = np.zeros((nloci, 6, 300), dtype=np.float64) ## if not mindict, make one that requires 1 in each taxon if isinstance(mindict, int): mindict = {i: mindict for i in taxdict} elif isinstance(mindict, dict): mindict = {i: mindict[i] for i in taxdict} else: mindict = {i: 1 for i in taxdict} ## raise error if names are not 'p[int]' allowed_names = ['p1', 'p2', 'p3', 'p4', 'p5'] if any([i not in allowed_names for i in taxdict]): raise IPyradError(\ "keys in taxdict must be named 'p1' through 'p4' or 'p5'") ## parse key names keys = sorted([i for i in taxdict.keys() if i[0] == 'p']) outg = keys[-1] ## grab seqs just for the good guys for loc in xrange(nloci): ## parse the locus lines = loci[loc].split("\n")[:-1] names = [i.split()[0] for i in lines] seqs = np.array([list(i.split()[1]) for i in lines]) ## check that names cover the taxdict (still need to check by site) covs = [sum([j in names for j in taxdict[tax]]) >= mindict[tax] \ for tax in taxdict] ## keep locus if all(covs): keep[loc] = True ## get the refseq refidx = np.where([i in taxdict[outg] for i in names])[0] refseq = seqs[refidx].view(np.uint8) ancestral = np.array([reftrick(refseq, GETCONS2)[:, 0]]) ## freq of ref in outgroup iseq = _reffreq2(ancestral, refseq, GETCONS2) arr[loc, -1, :iseq.shape[1]] = iseq ## enter 4-taxon freqs if len(taxdict) == 4: for tidx, key in enumerate(keys[:-1]): ## get idx of names in test tax nidx = np.where([i in taxdict[key] for i in names])[0] sidx = seqs[nidx].view(np.uint8) ## get freq of sidx iseq = _reffreq2(ancestral, sidx, GETCONS2) ## fill it in arr[loc, tidx, :iseq.shape[1]] = iseq else: ## entere p5; and fill it in iseq = _reffreq2(ancestral, refseq, GETCONS2) arr[loc, -1, :iseq.shape[1]] = iseq ## enter p1 nidx = np.where([i in taxdict['p1'] for i in names])[0] sidx = seqs[nidx].view(np.uint8) iseq = _reffreq2(ancestral, sidx, GETCONS2) arr[loc, 0, :iseq.shape[1]] = iseq ## enter p2 nidx = np.where([i in taxdict['p2'] for i in names])[0] sidx = seqs[nidx].view(np.uint8) iseq = _reffreq2(ancestral, sidx, GETCONS2) arr[loc, 1, :iseq.shape[1]] = iseq ## enter p3 with p4 masked, and p4 with p3 masked nidx = np.where([i in taxdict['p3'] for i in names])[0] nidy = np.where([i in taxdict['p4'] for i in names])[0] sidx = seqs[nidx].view(np.uint8) sidy = seqs[nidy].view(np.uint8) xseq = _reffreq2(ancestral, sidx, GETCONS2) yseq = _reffreq2(ancestral, sidy, GETCONS2) mask3 = xseq != 0 mask4 = yseq != 0 xseq[mask4] = 0 yseq[mask3] = 0 arr[loc, 2, :xseq.shape[1]] = xseq arr[loc, 3, :yseq.shape[1]] = yseq ## enter p34 nidx = nidx.tolist() + nidy.tolist() sidx = seqs[nidx].view(np.uint8) iseq = _reffreq2(ancestral, sidx, GETCONS2) arr[loc, 4, :iseq.shape[1]] = iseq ## size-down array to the number of loci that have taxa for the test arr = arr[keep, :, :] ## size-down sites to arr = masknulls(arr) return arr, keep # In[ ]: # ### Calculations # # Fst can be calculated per SNP, or averaged over sliding windows of SNPs of some size if reference information is present. Or over loci if # In[309]: pop.results # In[527]: popdict = {'i': [2, 3], 'j': [2, 4]} mindict = {} (mindict if mindict else {j:1 for j in popdict}) # In[528]: testdata1 = np.array([ [0, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 1, 1, 0, 0], [1, 1, 1, 1, 0], [1, 1, 1, 1, 0], [1, 1, 1, 1, 1], ]) testdata2 = np.random.binomial(1, 0.01, (6, 100)) # In[532]: class Self: def __init__(self, data, popdict={}, mindict={}, mapfile=False): self.data = data self.popdict = popdict self.mindict = mindict self._parsedata() self._checkdicts() self._filterdata() def _parsedata(self): """ parse data as an ndarray, vcfile, or locifile and store in ... arrays, one with all sites, and mapfile... """ pass def _checkdicts(self): #assert len(popdict) > 2, "must enter at least two populations" self.mindict = (mindict if mindict else {j:1 for j in popdict}) def _filterdata(self): pass def fst_H(self, pop1idx, pop2idx): ii = list(itertools.combinations(pop1idx, 2)) jj = list(itertools.combinations(pop2idx, 2)) kk = itertools.combinations(pop1idx + pop2idx, 2) between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk) diff = [self.data[i] != self.data[j] for (i, j) in ii + jj] sums = np.sum(diff, axis=0) a = sums / sums.shape[0] diff = [self.data[i] != self.data[j] for (i, j) in between] sums = np.sum(diff, axis=0) b = sums / sums.shape[0] return abs(1 - (a / b)) def theta_W(self): "return Watternson's theta for each population" df = pd.DataFrame( data=np.zeros(len(self.popdict)), columns=["theta"], index=list(self.popdict.keys()) ) for row in self.popdict: idat = self.data[self.popdict[row], :] segregating = np.invert(np.all(idat == idat[0], axis=0)).sum() denom = np.sum((1 / i for i in range(1, idat.shape[1] - 1))) theta = segregating / denom df.loc[row] = theta return df def pi(self, pop1idx, pop2idx): pass # In[533]: popdict = {'A': [0, 1], 'B': [2, 3, 4, 5]} mindict = {'A': 1, 'B': 2} self = Self(testdata2, popdict, mindict) # In[535]: self.theta_W() # In[537]: print(self.data) print("\nFst per SNP:", self.fst_H([0, 1], [2, 3, 4, 5])) print("\nFst mean:", np.mean(self.fst_H([0, 1], [2, 3, 4, 5]))) print("\ntheta per individual:", self.theta_W([0, 1], [2, 3, 4, 5])) print("\ntheta per population:", self.theta_W([0, 1], [2, 3, 4, 5])) # In[555]: pd.DataFrame( data=np.zeros((2, 2)), index=['aaaaaa', 'bbbbbbbbbbb'], ) # In[242]: self.fst([0, 1], [2, 3, 4, 5]) # In[235]: pop1idx = [0, 1] pop2idx = [2, 3, 4, 5] pop1 = self.data[pop1idx, :] pop2 = self.data[pop2idx, :] popa = self.data[pop1idx + pop2idx, :] # In[74]: ilist = itertools.combinations(range(4), 2) list(ilist) # In[75]: ilist = itertools.combinations(range(pop2.shape[0]), 2) list(ilist) # In[80]: def diffs(pop): ilist = itertools.combinations(range(pop.shape[0]), 2) diff = [pop[i] == pop[j] for (i, j) in ilist] sums = np.sum(diff, axis=0) return sums/sums.shape[0] # In[231]: def fst(pop1idx, pop2idx): ii = list(itertools.combinations(pop1idx, 2)) jj = list(itertools.combinations(pop2idx, 2)) kk = itertools.combinations(pop1idx + pop2idx, 2) between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk) diff = [self.data[i] != self.data[j] for (i, j) in ii + jj] sums = np.sum(diff, axis=0) a = sums / sums.shape[0] diff = [self.data[i] != self.data[j] for (i, j) in ii + jj] sums = np.sum(diff, axis=0) b = sums / sums.shape[0] return abs(1 - (a / b)) # In[175]: #fst([0, 1], [2, 3, 4, 5]) # In[245]: ii = list(itertools.combinations(pop1idx, 2)) jj = list(itertools.combinations(pop2idx, 2)) within = ii + jj kk = itertools.combinations(pop1idx + pop2idx, 2) between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk) # In[246]: ii + jj # In[247]: between # In[244]: self.data # In[209]: diff = [self.data[i] != self.data[j] for (i, j) in ii + jj] sums = np.sum(diff, axis=0) a = sums / sums.shape[0] a # In[243]: abs(1 - (a / b)) # In[210]: diff = [self.data[i] != self.data[j] for (i, j) in between] sums = np.sum(diff, axis=0) b = sums / sums.shape[0] # In[195]: 1 - (a / b) # In[196]: self.data # In[171]: between # In[79]: diffs(pop2) / diffs(popa) # In[14]: for i in range(pop1.shape[0]): for j in range(pop1.shape[0]): print(i,j) # In[123]: def _fstH(self, pop1idx, pop2idx): "Hudson's Fst estimator" pop1 = self.data[pop1idx, :] pop2 = self.data[pop2idx, :] pop1freq = np.mean(pop1, axis=0) pop2freq = np.mean(pop2, axis=0) a = (pop1freq - pop2freq) ** 2 b = ((pop1freq * (1.0 - pop1freq))) / (pop1.shape[0] - 1) c = ((pop2freq * (1.0 - pop2freq))) / (pop2.shape[0] - 1) return pd.Series(a - b - c).round(5) # In[141]: def _fstWC(self, pop1idx, pop2idx): "Wier and Cockerham (1984) Fst estimator" pop1 = self.data[pop1idx, :] pop2 = self.data[pop2idx, :] popa = self.data[pop1idx + pop2idx, :] pop1freq = np.mean(pop1, axis=0) pop2freq = np.mean(pop2, axis=0) popafreq = np.mean(popa, axis=0) nbar = pop1.sum(axis=0) / 2. + pop2.sum(axis=0) / 2. s2a = ( (1. / ((popa.shape[0] - 1) * nbar)) * ((pop1.shape[0] * 1)) ) nc = ( (1. / (npops - 1.)) * (popa.shape[0] - ((pop1.shape[0] ** 2 + pop2.shape[0] ** 2) / popa.shape[0]) ) ) #t1 = #t2 = return t1, t2 # In[ ]: def fstWC(self, pop1idx, pop2idx): pop1 = self.data[pop1idx, :] pop2 = self.data[pop2idx, :] popa = self.data[pop1idx + pop2idx, :] pop1freq = np.mean(pop1, axis=0) pop2freq = np.mean(pop2, axis=0) popafreq = np.mean(popa, axis=0) # frequecy of allele A in the sample of size ni from population i p = ... # observed proportion of individuals heterozygous for allele A hbar = # the average sample size nbar = nc = # In[138]: pop1.sum(axis=0) / 2. + pop2.sum(axis=0) / 2. # In[140]: pop1.shape[0] / 2. + pop2.shape[0] / 2. # In[ ]: npops= 2.0 nsamples = float(NpopA + NpopB) n_bar= (NpopA / npops) + (NpopB / npops) samplefreq = ( (popAcount+popBcount) / (NpopA + NpopB) ) pop1freq = popAcount / float(NpopA ) pop2freq = popBcount / float(NpopB ) Npop1 = NpopA Npop2 = NpopB S2A= (1/ ( (npops-1.0) * n_bar) ) * ( ( (Npop1)* ((pop1freq-samplefreq)**2) ) + ( (Npop2)*((pop2freq-samplefreq)**2) ) ) nc = 1.0/(npops-1.0) * ( (Npop1+Npop2) - (((Npop1**2)+(Npop2**2)) / (Npop1+Npop2)) ) T_1 = S2A -( ( 1/(n_bar-1) ) * ( (samplefreq * (1-samplefreq)) - ((npops-1)/npops)* S2A ) ) T_2 = (( (nc-1) / (n_bar-1) ) * samplefreq *(1-samplefreq) ) + (1.0 + (((npops-1)*(n_bar-nc)) / (n_bar-1))) * (S2A/npops) # In[119]: pop1idx = [0, 1] pop2idx = [2, 3, 4] _fst(self, pop1idx, pop2idx) # In[120]: pop1idx = [0, 4] pop2idx = [1, 2, 3] _fst(self, pop1idx, pop2idx) # In[22]: pop1 = self.data[pop1idx, :] pop2 = self.data[pop2idx, :] popa = self.data[pop1idx + pop2idx, :] print(pop1) print(pop2) # In[104]: pop1freq = np.mean(pop1, axis=0) pop2freq = np.mean(pop2, axis=0) popafreq = np.mean(popa, axis=0) # In[105]: a = (pop1freq - pop2freq) ** 2 a # In[106]: b = ((pop1freq * (1.0 - pop1freq))) / (pop1.shape[0] - 1) b # In[107]: c = ((pop2freq * (1.0 - pop2freq))) / (pop2.shape[0] - 1) c # In[108]: pd.Series(a - b - c).round(3) # In[98]: pd.Series(a - b - c).round(3) # In[114]: d = (pop1freq * (1.0 - pop2freq)) + (pop1freq * (1.0 - pop2freq)) d # In[151]: (np.mean(popa, axis=0) - np.mean(pop2, axis=0)) / (np.mean(popa, axis=0)) # In[152]: popa # In[153]: np.count_nonzero(pop2, axis=0) # In[169]: a = np.mean(popa, axis=0) mask = a > 0.5 a[a > 0.5] = abs(a[a > 0.5] - 1) a # In[172]: b = np.mean(pop2, axis=0) mask = b > 0.5 b[b > 0.5] = abs(b[b > 0.5] - 1) b # In[171]: (a - b) / a # In[ ]: