#!/usr/bin/env python # coding: utf-8 # ## (📗) Cookbook: `ipyrad.analysis simulations` # # This notebook demonstrates how to use the `baba` module in `ipyrad` to test for admixture and introgression. The code is written in Python and is best implemented in a Jupyter-notebook like this one. Analyses can be split up among many computing cores. Finally, there are some simple plotting methods to accompany tests which I show below. # ### import packages # In[1]: ## imports import numpy as np import ipyrad as ip import ipyparallel as ipp from ipyrad.analysis import baba ## print versions print "ipyrad v.{}".format(ip.__version__) print "ipyparallel v.{}".format(ipp.__version__) print "numpy v.{}".format(np.__version__) # ### ipcluster setup to run parallel code # You can find more details about this in our [ipyparallel tutorial]. # In[2]: ## start 4 engines by running the commented line below in a separate bash terminal. ## ## ipcluster start --n=4 # In[3]: ## connect to client and print connection summary ipyclient = ipp.Client() print ip.cluster_info(client=ipyclient) # ### Get default (example) 12 taxon tree # The function `baba.Tree()` takes a `newick` string as an argument, or, if no value is passed, it constructs the default 12 taxon tree shown below. The `Tree` Class object holds a representation of the tree, which can be used for plotting, as well as for describing a demographic model for simulating data, as we'll show below. # In[4]: tre.draw(width=400, height=200); # In[38]: ## tree2dict import ete3 newick = "./analysis_tetrad/pedtest1.full.tre" ## load the newick string as a Tree object tre = ete3.Tree(newick) ## set przewalskiis as the outgroup prz = [i for i in tre.get_leaves() if "prz" in i.name] out = tre.get_common_ancestor(prz) ## set new outgroup and store back as a newick string tre.set_outgroup(out) newick = tre.write() def test_constraint(node, cdict, tip, exact): names = set(node.get_leaf_names()) const = set(cdict[tip]) if const: if exact: if len(names.intersection(const)) == len(const): return 1 else: return 0 else: if len(names.intersection(const)) == len(names): return 1 else: return 0 return 1 # In[39]: ## constraints #if not constraint_dict: cdict = {"p1":[], "p2":[], "p3":[], "p4":[]} cdict.update({"p4":['a']}) cdict # In[45]: def tree2tests(newick, constraint_dict=None, constraint_exact=True): """ Returns dict of all possible four-taxon splits in a tree. Assumes the user has entered a rooted tree. Skips polytomies. """ ## make tree tree = ete3.Tree(newick) ## constraints #if not constraint_dict: cdict = {"p1":[], "p2":[], "p3":[], "p4":[]} cdict.update(constraint_dict) print cdict ## traverse root to tips. Treat the left as outgroup, then the right. tests = [] ## topnode must have children for topnode in tree.traverse("levelorder"): ## test onode as either child for oparent in topnode.children: ## test outgroup as all descendants on one child branch for onode in oparent.traverse(): ## put constraints on onode if test_constraint(onode, cdict, "p4", constraint_exact): ## p123 parent is sister to oparent p123parent = oparent.get_sisters()[0] ## test p3 as all descendants of p123parent for p3parent in p123parent.children: ## p12 parent is sister to p3parent p12parent = p3parent.get_sisters()[0] for p3node in p3parent.traverse(): if test_constraint(p3node, cdict, "p3", constraint_exact): if p12parent.children: p1parent, p2parent = p12parent.children for p2node in p2parent.traverse(): if test_constraint(p2node, cdict, "p2", constraint_exact): for p1node in p1parent.traverse(): if test_constraint(p1node, cdict, "p1", constraint_exact): test = {} test['p4'] = onode.get_leaf_names() test['p3'] = p3node.get_leaf_names() test['p2'] = p2node.get_leaf_names() test['p1'] = p1node.get_leaf_names() tests.append(test) return tests # In[46]: tests = tree2tests(newick, constraint_dict={"p4":['32082_przewalskii', '33588_przewalskii'], "p3":['30686_cyathophylla']}, constraint_exact=True) len(tests) # In[43]: tests # ### Or get any arbitrary tree # In[5]: ## pass a newick string to the Tree class object tre = baba.Tree(newick="((a,b),c);") tre.draw(width=200, height=200); # ### get a tree with an admixture edge # Shown by an arrow pointing backwards from source to sink (indicating that geneflow should be entered as occurring backwards in time). This is how the program `msprime` will interpret it. # In[6]: ## store newick tree and a migration event list [source, sink, start, stop, rate] ## if not edges are entered then it is converted to cladogram newick = "(((a,b),Cow), (d,e));" events = [['e', 'Cow', 0, 1, 1e-6], ['3', '1', 1, 2, 1e-6]] ## initiate Tree object with newick and admix args tre = baba.Tree(newick=newick, admix=events) ## show the tree tre.draw(width=250, height=250, yaxis=True); # In[7]: ## a way of finding names xpos from verts tre.verts[tre.verts[:, 1] == 0] tre.tree.search_nodes(name='b')[0].idx tre.verts[6, 0] print [tre.verts[tre.tree.search_nodes(name=name)[0].idx, 0] for name in tre.tree.get_leaf_names()] print tre.tree.get_leaf_names() # ### simulate data on that tree # # In[8]: ## returns a Sim data object sims = tre.simulate(nreps=10000, Ns=50000, gen=20) # In[9]: ## what is in the sims object? 4 attributes: for key, val in sims.__dict__.items(): print key, val # In[10]: baba._msp_to_arr(sims, test) # In[31]: def _msp_to_arr(Sim, test): ## the fixed tree dictionary #fix = {j: [i, i+1] for j, i in zip(list("abcdefghijkl"), range(0, 24, 2))} fix = {j: [i, i+1] for j, i in zip(Sim.names, range(0, len(Sim.names)*2, 2))} ## fill taxdict by test keys = ['p1', 'p2', 'p3', 'p4'] arr = np.zeros((Sim.nreps, 4, 100)) ## unless it's a 5-taxon test if len(test) == 5: arr = np.zeros((100000, 6, 100)) keys += ['p5'] ## create array sampler for taxa taxs = [test[key] for key in keys] idxs = [list(itertools.chain(*[fix[j] for j in i])) for i in taxs] print idxs # In[32]: from ipyrad.analysis.baba import * _msp_to_arr(sims, test) # ### A function to print the variant matrix of one rep. # This is used just for demonstration. The matrix will have 2X as many columns as there are # tips in the tree, corresponding to two alleles per individual. The number of rows is the # number of variant sites simulated. # In[39]: def print_variant_matrix(tree): shape = tree.get_num_mutations(), tree.get_sample_size() arr = np.empty(shape, dtype="u1") for variant in tree.variants(): arr[variant.index] = variant.genotypes print(arr) # In[45]: ## in order of ladderized tip names (2 copies per tip) print_variant_matrix(sims.sims.next()) # ### Simulate data for ABBA-BABA tests # In[46]: sims # In[12]: ## simulate data on tree #sims = tre.simulate(nreps=10000, Ns=50000, gen=20) ## pass sim object to baba test = { 'p4': ['e', 'd'], 'p3': ['Cow'], 'p2': ['b'], 'p1': ['a'], } #baba.batch(sims, test, ipyclient=ipyclient) # In[ ]: