#!/usr/bin/env python
# coding: utf-8
#
ipyrad-analysis toolkit: tetrad
#
# The `tetrad` tool is a framework for inferring a species tree topology using quartet-joining on large unlinked SNP data sets. It is particularly optimized for RAD-seq type datasets that are likely to involve a lot of missing data.
# ### Load libraries
# In[38]:
# conda install ipyrad -c conda-forge -c bioconda
# conda install tetrad -c conda-forge
# In[2]:
import ipyrad.analysis as ipa
import toytree
import ipcoal
# ### Simulate a random tree with 20 tips and crown age of 10M generations
# In[15]:
tree = toytree.rtree.bdtree(ntips=20, seed=555)
tree = tree.mod.node_scale_root_height(10e6)
tree.draw(scalebar=True);
# ### Simulate SNPs with missing data and write to database (.seqs.hdf5)
# In[16]:
# init simulator with one diploid sample from each tip
model = ipcoal.Model(tree, Ne=1e6, nsamples=2, recomb=0)
# simulate sequence data on 10K loci
model.sim_loci(10000, 50)
# add missing data (50%)
model.apply_missing_mask(0.5)
# write results to database file
model.write_snps_to_hdf5(name="test-tet-miss50", outdir='/tmp', diploid=True)
# ### Infer tetrad tree
# In[17]:
SNPS = "/tmp/test-tet-miss50.snps.hdf5"
# In[19]:
tet = ipa.tetrad(
data=SNPS,
name="test-tet-miss50",
workdir="/tmp",
nboots=10,
nquartets=1e6,
)
# In[20]:
tet.run(auto=True, force=True)
# ### Draw the inferred tetrad tree
# In[30]:
tre = toytree.tree(tet.trees.cons)
rtre = tre.root(["r19", "r18", "r17"])
rtre.draw(ts='d', use_edge_lengths=False, node_labels="support");
# ### Does this tree match the *true* species tree?
# In[37]:
rfdist = rtre.treenode.robinson_foulds(tree.treenode)[0]
rfdist == 0