#!/usr/bin/env python
# coding: utf-8

# ## (&#x1F4D7;) Cookbook: `ipyrad.analysis simulations` 
# 
# This notebook demonstrates how to use the `baba` module in `ipyrad` to test for admixture and introgression. The code is written in Python and is best implemented in a Jupyter-notebook like this one. Analyses can be split up among many computing cores. Finally, there are some simple plotting methods to accompany tests which I show below. 

# ### import packages

# In[1]:


## imports
import numpy as np
import ipyrad as ip
import ipyparallel as ipp
from ipyrad.analysis import baba

## print versions
print "ipyrad v.{}".format(ip.__version__)
print "ipyparallel v.{}".format(ipp.__version__)
print "numpy v.{}".format(np.__version__)


# ### ipcluster setup to run parallel code
# You can find more details about this in our [ipyparallel tutorial].

# In[2]:


## start 4 engines by running the commented line below in a separate bash terminal. 
##
## ipcluster start --n=4


# In[3]:


## connect to client and print connection summary
ipyclient = ipp.Client()
print ip.cluster_info(client=ipyclient)


# ### Get default (example) 12 taxon tree 
# The function  `baba.Tree()`  takes a  `newick`  string as an argument, or, if no value is passed, it constructs the default 12 taxon tree shown below. The  `Tree`  Class object holds a representation of the tree, which can be used for plotting, as well as for describing a demographic model for simulating data, as we'll show below. 

# In[4]:


tre.draw(width=400, height=200);


# In[38]:


## tree2dict 
import ete3
newick = "./analysis_tetrad/pedtest1.full.tre"
## load the newick string as a Tree object
tre = ete3.Tree(newick)

## set przewalskiis as the outgroup
prz = [i for i in tre.get_leaves() if "prz" in i.name]
out = tre.get_common_ancestor(prz)

## set new outgroup and store back as a newick string
tre.set_outgroup(out)
newick = tre.write()


def test_constraint(node, cdict, tip, exact):
    names = set(node.get_leaf_names())
    const = set(cdict[tip])
    if const:
        if exact:
            if len(names.intersection(const)) == len(const):
                return 1
            else:
                return 0
        else:
            if len(names.intersection(const)) == len(names):
                return 1
            else:
                return 0        
    return 1
    

# In[39]:


## constraints
#if not constraint_dict:
cdict = {"p1":[], "p2":[], "p3":[], "p4":[]}
cdict.update({"p4":['a']})

cdict


# In[45]:


def tree2tests(newick, constraint_dict=None, constraint_exact=True):
    """
    Returns dict of all possible four-taxon splits in a tree. Assumes
    the user has entered a rooted tree. Skips polytomies.
    """
    ## make tree
    tree = ete3.Tree(newick)
    
    ## constraints
    #if not constraint_dict:
    cdict = {"p1":[], "p2":[], "p3":[], "p4":[]}
    cdict.update(constraint_dict)
    print cdict

    ## traverse root to tips. Treat the left as outgroup, then the right.
    tests = []
    ## topnode must have children
    for topnode in tree.traverse("levelorder"):  
        ## test onode as either child
        for oparent in topnode.children:
            ## test outgroup as all descendants on one child branch
            for onode in oparent.traverse():
                ## put constraints on onode
                if test_constraint(onode, cdict, "p4", constraint_exact):
                    ## p123 parent is sister to oparent
                    p123parent = oparent.get_sisters()[0]
                    ## test p3 as all descendants of p123parent
                    for p3parent in p123parent.children:
                        ## p12 parent is sister to p3parent
                        p12parent = p3parent.get_sisters()[0]
                        for p3node in p3parent.traverse():
                            if test_constraint(p3node, cdict, "p3", constraint_exact):
                                if p12parent.children:
                                    p1parent, p2parent = p12parent.children
                                    for p2node in p2parent.traverse():
                                        if test_constraint(p2node, cdict, "p2", constraint_exact):
                                            for p1node in p1parent.traverse():
                                                if test_constraint(p1node, cdict, "p1", constraint_exact):
                                                    test = {}
                                                    test['p4'] = onode.get_leaf_names()
                                                    test['p3'] = p3node.get_leaf_names()
                                                    test['p2'] = p2node.get_leaf_names()
                                                    test['p1'] = p1node.get_leaf_names()
                                                    tests.append(test)
    return tests


# In[46]:


tests = tree2tests(newick, 
                   constraint_dict={"p4":['32082_przewalskii', '33588_przewalskii'], 
                                    "p3":['30686_cyathophylla']}, 
                   constraint_exact=True)
len(tests)


# In[43]:


tests


# ### Or get any arbitrary tree

# In[5]:


## pass a newick string to the Tree class object
tre = baba.Tree(newick="((a,b),c);")
tre.draw(width=200, height=200);


# ### get a tree with an admixture edge
# Shown by an arrow pointing backwards from source to sink (indicating that geneflow should be entered as occurring backwards in time). This is how the program `msprime` will interpret it. 

# In[6]:


## store newick tree and a migration event list [source, sink, start, stop, rate]
## if not edges are entered then it is converted to cladogram
newick = "(((a,b),Cow), (d,e));"
events = [['e', 'Cow', 0, 1, 1e-6], 
          ['3', '1', 1, 2, 1e-6]]

## initiate Tree object with newick and admix args
tre = baba.Tree(newick=newick, 
                admix=events)

## show the tree 
tre.draw(width=250, height=250, yaxis=True);


# In[7]:


## a way of finding names xpos from verts
tre.verts[tre.verts[:, 1] == 0]
tre.tree.search_nodes(name='b')[0].idx

tre.verts[6, 0]

print [tre.verts[tre.tree.search_nodes(name=name)[0].idx, 0]
                for name in tre.tree.get_leaf_names()]

print tre.tree.get_leaf_names()


# ### simulate data on that tree
# 

# In[8]:


## returns a Sim data object
sims = tre.simulate(nreps=10000, Ns=50000, gen=20)


# In[9]:


## what is in the sims object? 4 attributes:
for key, val in sims.__dict__.items():
    print key, val


# In[10]:


baba._msp_to_arr(sims, test)


# In[31]:


def _msp_to_arr(Sim, test):
    
    ## the fixed tree dictionary
    #fix = {j: [i, i+1] for j, i in zip(list("abcdefghijkl"), range(0, 24, 2))}
    fix = {j: [i, i+1] for j, i in zip(Sim.names, range(0, len(Sim.names)*2, 2))}
    
    ## fill taxdict by test
    keys = ['p1', 'p2', 'p3', 'p4']
    arr = np.zeros((Sim.nreps, 4, 100))
    
    ## unless it's a 5-taxon test
    if len(test) == 5:
        arr = np.zeros((100000, 6, 100))
        keys += ['p5']
    
    ## create array sampler for taxa
    taxs = [test[key] for key in keys]
    idxs = [list(itertools.chain(*[fix[j] for j in i])) for i in taxs]
    print idxs


# In[32]:


from ipyrad.analysis.baba import *
_msp_to_arr(sims, test)


# ### A function to print the variant matrix of one rep.
# This is used just for demonstration. The matrix will have 2X as many columns as there are 
# tips in the tree, corresponding to two alleles per individual. The number of rows is the 
# number of variant sites simulated. 

# In[39]:


def print_variant_matrix(tree):
    shape = tree.get_num_mutations(), tree.get_sample_size()
    arr = np.empty(shape, dtype="u1")
    for variant in tree.variants():
        arr[variant.index] = variant.genotypes
    print(arr)


# In[45]:


## in order of ladderized tip names (2 copies per tip)
print_variant_matrix(sims.sims.next())


# ### Simulate data for ABBA-BABA tests

# In[46]:


sims


# In[12]:


## simulate data on tree
#sims = tre.simulate(nreps=10000, Ns=50000, gen=20)

## pass sim object to baba
test = {
    'p4': ['e', 'd'],
    'p3': ['Cow'], 
    'p2': ['b'],
    'p1': ['a'],    
}

#baba.batch(sims, test, ipyclient=ipyclient)


# In[ ]: