#!/usr/bin/env python
# coding: utf-8

# ## Calculate Fst and other popgen statistics

# In[1]:


import ipyrad.analysis as ipa
import numpy as np
import pandas as pd
import itertools 


# In[2]:


# get a sequence alignment from the HDF5


# In[3]:


database = "/home/deren/Downloads/spp30.seqs.hdf5"
idx=0
wmin=0
wmax=100000


# In[ ]:


parse_phystring


# In[12]:


# init a popgen analysis object
pop = ipa.popgen(
    name="ped", 
    workdir="analysis-popgen", 
    data="./pedicularis/clust85_min12_outfiles/clust85_min12.loci",
)


# In[11]:


pop._fst()


# ## Update `loci_to_arr()`
# 
# ... Now I'm thinking maybe I really should make a HDF5 format as the default for downstream analyses..., it can have SNPs, mapfile, and whole sequences with locimap. 

# In[ ]:


def _loci_to_arr(self):
    """
    return a frequency array from a loci file for all loci with taxa from 
    taxdict and min coverage from mindict. 
    """

    # 
    loci = infile.read().strip().split("|\n")
    
    ## make the array (4 or 5) and a mask array to remove loci without cov
    nloci = len(loci)
    keep = np.zeros(nloci, dtype=np.bool_)
    arr = np.zeros((nloci, 4, 300), dtype=np.float64)

    ## six rows b/c one for each p3, and for the fused p3 ancestor
    if len(taxdict) == 5:
        arr = np.zeros((nloci, 6, 300), dtype=np.float64)

    ## if not mindict, make one that requires 1 in each taxon
    if isinstance(mindict, int):
        mindict = {i: mindict for i in taxdict}
    elif isinstance(mindict, dict):
        mindict = {i: mindict[i] for i in taxdict}
    else:
        mindict = {i: 1 for i in taxdict}

    ## raise error if names are not 'p[int]' 
    allowed_names = ['p1', 'p2', 'p3', 'p4', 'p5']
    if any([i not in allowed_names for i in taxdict]):
        raise IPyradError(\
            "keys in taxdict must be named 'p1' through 'p4' or 'p5'")

    ## parse key names
    keys = sorted([i for i in taxdict.keys() if i[0] == 'p'])
    outg = keys[-1]

    ## grab seqs just for the good guys
    for loc in xrange(nloci):

        ## parse the locus
        lines = loci[loc].split("\n")[:-1]
        names = [i.split()[0] for i in lines]
        seqs = np.array([list(i.split()[1]) for i in lines])

        ## check that names cover the taxdict (still need to check by site)
        covs = [sum([j in names for j in taxdict[tax]]) >= mindict[tax] \
                for tax in taxdict]

        ## keep locus
        if all(covs):
            keep[loc] = True

            ## get the refseq
            refidx = np.where([i in taxdict[outg] for i in names])[0]
            refseq = seqs[refidx].view(np.uint8)
            ancestral = np.array([reftrick(refseq, GETCONS2)[:, 0]])

            ## freq of ref in outgroup
            iseq = _reffreq2(ancestral, refseq, GETCONS2)
            arr[loc, -1, :iseq.shape[1]] = iseq 

            ## enter 4-taxon freqs
            if len(taxdict) == 4:
                for tidx, key in enumerate(keys[:-1]):

                    ## get idx of names in test tax
                    nidx = np.where([i in taxdict[key] for i in names])[0]
                    sidx = seqs[nidx].view(np.uint8)
                   
                    ## get freq of sidx
                    iseq = _reffreq2(ancestral, sidx, GETCONS2)
                   
                    ## fill it in 
                    arr[loc, tidx, :iseq.shape[1]] = iseq

            else:

                ## entere p5; and fill it in
                iseq = _reffreq2(ancestral, refseq, GETCONS2) 
                arr[loc, -1, :iseq.shape[1]] = iseq 
                
                ## enter p1
                nidx = np.where([i in taxdict['p1'] for i in names])[0]
                sidx = seqs[nidx].view(np.uint8)
                iseq = _reffreq2(ancestral, sidx, GETCONS2)
                arr[loc, 0, :iseq.shape[1]] = iseq
                
                ## enter p2
                nidx = np.where([i in taxdict['p2'] for i in names])[0]
                sidx = seqs[nidx].view(np.uint8)
                iseq = _reffreq2(ancestral, sidx, GETCONS2)
                arr[loc, 1, :iseq.shape[1]] = iseq
                
                ## enter p3 with p4 masked, and p4 with p3 masked
                nidx = np.where([i in taxdict['p3'] for i in names])[0]
                nidy = np.where([i in taxdict['p4'] for i in names])[0]
                sidx = seqs[nidx].view(np.uint8)
                sidy = seqs[nidy].view(np.uint8)
                xseq = _reffreq2(ancestral, sidx, GETCONS2)
                yseq = _reffreq2(ancestral, sidy, GETCONS2)
                mask3 = xseq != 0
                mask4 = yseq != 0
                xseq[mask4] = 0
                yseq[mask3] = 0
                arr[loc, 2, :xseq.shape[1]] = xseq
                arr[loc, 3, :yseq.shape[1]] = yseq
                
                ## enter p34 
                nidx = nidx.tolist() + nidy.tolist()
                sidx = seqs[nidx].view(np.uint8)
                iseq = _reffreq2(ancestral, sidx, GETCONS2)
                arr[loc, 4, :iseq.shape[1]] = iseq


    ## size-down array to the number of loci that have taxa for the test
    arr = arr[keep, :, :]

    ## size-down sites to 
    arr = masknulls(arr)

    return arr, keep


# In[ ]:


# ### Calculations
# 
# Fst can be calculated per SNP, or averaged over sliding windows of SNPs of some size if reference information is present. Or over loci if 

# In[309]:


pop.results


# In[527]:


popdict = {'i': [2, 3], 'j': [2, 4]}
mindict = {}
(mindict if mindict else {j:1 for j in popdict})


# In[528]:


testdata1 = np.array([
    [0, 0, 0, 0, 0],
    [1, 0, 0, 0, 0],
    [1, 1, 1, 0, 0], 
    [1, 1, 1, 1, 0],
    [1, 1, 1, 1, 0], 
    [1, 1, 1, 1, 1],
])

testdata2 = np.random.binomial(1, 0.01, (6, 100))


# In[532]:


class Self:
    def __init__(self, data, popdict={}, mindict={}, mapfile=False):
        self.data = data
        self.popdict = popdict
        self.mindict = mindict
        self._parsedata()
        self._checkdicts()
        self._filterdata()
        
    def _parsedata(self):
        """
        parse data as an ndarray, vcfile, or locifile and store 
        in ... arrays, one with all sites, and mapfile... 
        """
        pass
    
    
    def _checkdicts(self):       
        #assert len(popdict) > 2, "must enter at least two populations"
        self.mindict = (mindict if mindict else {j:1 for j in popdict})
        
        
    def _filterdata(self):
        pass
        
        
    def fst_H(self, pop1idx, pop2idx):
        ii = list(itertools.combinations(pop1idx, 2))
        jj = list(itertools.combinations(pop2idx, 2))
        kk = itertools.combinations(pop1idx + pop2idx, 2)
        between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk)    

        diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]
        sums = np.sum(diff, axis=0)
        a = sums / sums.shape[0]

        diff = [self.data[i] != self.data[j] for (i, j) in between]
        sums = np.sum(diff, axis=0)
        b = sums / sums.shape[0]
        return abs(1 - (a / b))
    
    
    def theta_W(self):
        "return Watternson's theta for each population"
        df = pd.DataFrame(
            data=np.zeros(len(self.popdict)),
            columns=["theta"],
            index=list(self.popdict.keys())
        )
        for row in self.popdict:
            idat = self.data[self.popdict[row], :]
            segregating = np.invert(np.all(idat == idat[0], axis=0)).sum()
            denom = np.sum((1 / i for i in range(1, idat.shape[1] - 1)))
            theta = segregating / denom
            df.loc[row] = theta
        return df
            
        
    def pi(self, pop1idx, pop2idx):
        pass


# In[533]:


popdict = {'A': [0, 1], 'B': [2, 3, 4, 5]}
mindict = {'A': 1, 'B': 2}
self = Self(testdata2, popdict, mindict)


# In[535]:


self.theta_W()


# In[537]:


print(self.data)
print("\nFst per SNP:", self.fst_H([0, 1], [2, 3, 4, 5]))
print("\nFst mean:", np.mean(self.fst_H([0, 1], [2, 3, 4, 5])))
print("\ntheta per individual:", self.theta_W([0, 1], [2, 3, 4, 5]))
print("\ntheta per population:", self.theta_W([0, 1], [2, 3, 4, 5]))


# In[555]:


pd.DataFrame(
    data=np.zeros((2, 2)),
    index=['aaaaaa', 'bbbbbbbbbbb'],
)


# In[242]:


self.fst([0, 1], [2, 3, 4, 5])


# In[235]:


pop1idx = [0, 1]
pop2idx = [2, 3, 4, 5]
pop1 = self.data[pop1idx, :]
pop2 = self.data[pop2idx, :]  
popa = self.data[pop1idx + pop2idx, :]


# In[74]:


ilist = itertools.combinations(range(4), 2)
list(ilist)


# In[75]:


ilist = itertools.combinations(range(pop2.shape[0]), 2)
list(ilist)


# In[80]:


def diffs(pop):
    ilist = itertools.combinations(range(pop.shape[0]), 2)
    diff = [pop[i] == pop[j] for (i, j) in ilist]
    sums = np.sum(diff, axis=0)
    return sums/sums.shape[0]


# In[231]:


def fst(pop1idx, pop2idx):
    ii = list(itertools.combinations(pop1idx, 2))
    jj = list(itertools.combinations(pop2idx, 2))
    kk = itertools.combinations(pop1idx + pop2idx, 2)
    between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk)    

    diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]
    sums = np.sum(diff, axis=0)
    a = sums / sums.shape[0]
    
    diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]
    sums = np.sum(diff, axis=0)
    b = sums / sums.shape[0]
    
    return abs(1 - (a / b))


# In[175]:


#fst([0, 1], [2, 3, 4, 5])


# In[245]:


ii = list(itertools.combinations(pop1idx, 2))
jj = list(itertools.combinations(pop2idx, 2))
within = ii + jj
kk = itertools.combinations(pop1idx + pop2idx, 2)
between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk)


# In[246]:


ii + jj


# In[247]:


between


# In[244]:


self.data


# In[209]:


diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]
sums = np.sum(diff, axis=0)
a = sums / sums.shape[0]

a


# In[243]:


abs(1 - (a / b))


# In[210]:


diff = [self.data[i] != self.data[j] for (i, j) in between]
sums = np.sum(diff, axis=0)
b = sums / sums.shape[0]


# In[195]:


1 - (a / b)


# In[196]:


self.data


# In[171]:


between


# In[79]:


diffs(pop2) / diffs(popa)


# In[14]:


for i in range(pop1.shape[0]):
    for j in range(pop1.shape[0]):
        print(i,j)


# In[123]:


def _fstH(self, pop1idx, pop2idx):
    "Hudson's Fst estimator"
    pop1 = self.data[pop1idx, :]
    pop2 = self.data[pop2idx, :]  
    pop1freq = np.mean(pop1, axis=0)
    pop2freq = np.mean(pop2, axis=0)
    a = (pop1freq - pop2freq) ** 2
    b = ((pop1freq * (1.0 - pop1freq))) / (pop1.shape[0] - 1)
    c = ((pop2freq * (1.0 - pop2freq))) / (pop2.shape[0] - 1)
    return pd.Series(a - b - c).round(5)


# In[141]:


def _fstWC(self, pop1idx, pop2idx):
    "Wier and Cockerham (1984) Fst estimator"
    pop1 = self.data[pop1idx, :]
    pop2 = self.data[pop2idx, :]  
    popa = self.data[pop1idx + pop2idx, :]
    pop1freq = np.mean(pop1, axis=0)
    pop2freq = np.mean(pop2, axis=0)
    popafreq = np.mean(popa, axis=0)
    nbar = pop1.sum(axis=0) / 2. + pop2.sum(axis=0) / 2.
    s2a = (
        (1. / ((popa.shape[0] - 1) * nbar))
        * ((pop1.shape[0] * 1))
    )
    nc = (
        (1. / (npops - 1.)) 
         * (popa.shape[0] 
            - ((pop1.shape[0] ** 2 + pop2.shape[0] ** 2) / popa.shape[0])
           )
    )
    #t1 = 
    #t2 = 
    return t1, t2


# In[ ]:


def fstWC(self, pop1idx, pop2idx):
    pop1 = self.data[pop1idx, :]
    pop2 = self.data[pop2idx, :]  
    popa = self.data[pop1idx + pop2idx, :]
    
    pop1freq = np.mean(pop1, axis=0)
    pop2freq = np.mean(pop2, axis=0)
    popafreq = np.mean(popa, axis=0)
    
    # frequecy of allele A in the sample of size ni from population i
    p = ...
    # observed proportion of individuals heterozygous for allele A
    hbar = 
    # the average sample size
    nbar = 
    nc = 


# In[138]:


pop1.sum(axis=0) / 2. + pop2.sum(axis=0) / 2.


# In[140]:


pop1.shape[0] / 2. + pop2.shape[0] / 2.


# In[ ]:


npops= 2.0
nsamples = float(NpopA + NpopB)
n_bar= (NpopA / npops) + (NpopB / npops)
samplefreq = ( (popAcount+popBcount) / (NpopA + NpopB) )
pop1freq = popAcount / float(NpopA )
pop2freq = popBcount / float(NpopB )
Npop1 = NpopA
Npop2 = NpopB
S2A= (1/ ( (npops-1.0) * n_bar) ) * 
        ( ( (Npop1)* ((pop1freq-samplefreq)**2) ) + 
         ( (Npop2)*((pop2freq-samplefreq)**2) ) )
nc = 1.0/(npops-1.0) * ( (Npop1+Npop2) - (((Npop1**2)+(Npop2**2)) / (Npop1+Npop2)) )
T_1 = S2A -( ( 1/(n_bar-1) ) * ( (samplefreq * (1-samplefreq)) -  ((npops-1)/npops)* S2A ) )
T_2 = (( (nc-1) / (n_bar-1) ) * samplefreq *(1-samplefreq) )   +  (1.0 +   (((npops-1)*(n_bar-nc))  / (n_bar-1)))       * (S2A/npops)


# In[119]:


pop1idx = [0, 1]
pop2idx = [2, 3, 4]
_fst(self, pop1idx, pop2idx)


# In[120]:


pop1idx = [0, 4]
pop2idx = [1, 2, 3]
_fst(self, pop1idx, pop2idx)


# In[22]:


pop1 = self.data[pop1idx, :]
pop2 = self.data[pop2idx, :]
popa = self.data[pop1idx + pop2idx, :]

print(pop1)
print(pop2)


# In[104]:


pop1freq = np.mean(pop1, axis=0)
pop2freq = np.mean(pop2, axis=0)
popafreq = np.mean(popa, axis=0)


# In[105]:


a = (pop1freq - pop2freq) ** 2
a


# In[106]:


b = ((pop1freq * (1.0 - pop1freq))) / (pop1.shape[0] - 1)
b


# In[107]:


c = ((pop2freq * (1.0 - pop2freq))) / (pop2.shape[0] - 1)
c


# In[108]:


pd.Series(a - b - c).round(3)


# In[98]:


pd.Series(a - b - c).round(3)


# In[114]:


d = (pop1freq * (1.0 - pop2freq)) + (pop1freq * (1.0 - pop2freq))
d


# In[151]:


(np.mean(popa, axis=0) - np.mean(pop2, axis=0)) / (np.mean(popa, axis=0))


# In[152]:


popa


# In[153]:


np.count_nonzero(pop2, axis=0)


# In[169]:


a = np.mean(popa, axis=0)
mask = a > 0.5
a[a > 0.5] = abs(a[a > 0.5] - 1)
a


# In[172]:


b = np.mean(pop2, axis=0)
mask = b > 0.5
b[b > 0.5] = abs(b[b > 0.5] - 1)
b


# In[171]:


(a - b) / a


# In[ ]: