#!/usr/bin/env python # coding: utf-8 # ### Notebook 16: Simulating RADseq data # #### Eaton et al. 2015 # In[22]: ## Requirements ## - Python 2.7 ## - pyrad v.3.1.0 (http://github.com/dereneaton/pyrad) ## - simrrls v.0.0.7 (http://github.com/dereneaton/simrrls) # In[150]: import itertools import ete2 import numpy as np import toyplot from collections import OrderedDict, Counter # ### Generate trees for simulations # Make a balanced tree with 64 tips and a tree length = 10 # In[151]: for n, node in enumerate(Tbal.get_leaves()): node.name = "t"+str(n) # In[152]: print Tbal.write() # In[153]: ## base tree Tbal = ete2.Tree() ## branch lengths bls = 10/6. ## first nodes n1 = Tbal.add_child(dist=bls) n2 = Tbal.add_child(dist=bls) ## make balanced tree while len(Tbal.get_leaves()) < 64: thisrep = Tbal.get_descendants() for node in thisrep: if len(node.get_children()) < 1: node.add_child(dist=bls) node.add_child(dist=bls) ## set leaf names for n, node in enumerate(Tbal.get_leaves()): node.name = "t"+str(n) ## Save newick string to file Tbal.write(outfile="Tbal.tre", format=3) # In[154]: cat Tbal.tre # ### Make an imbalanced tree of the same total tree length with 64 tips # In[155]: ## base tree Timb = ete2.Tree() ## scale branches to match balanced treelength brlen = (bls*6.)/63 ## first nodes n1 = Timb.add_child(dist=brlen) n2 = Timb.add_child(dist=brlen) while len(Timb.get_leaves()) < 64: ## extend others for tip in Timb.get_leaves()[:-1]: tip.dist += brlen ## extend the last node Timb.get_leaves()[-1].add_child(dist=brlen) Timb.get_leaves()[-1].add_sister(dist=brlen) ## set leaf names for n, node in enumerate(Timb.get_leaves()): node.name = "t"+str(n) ## write to file Timb.write(outfile="Timb.tre", format=3) # In[156]: cat Timb.tre # ### Check tree lengths # In[157]: print set([i.get_distance(Tbal) for i in Tbal]), 'treelength' print len(Tbal), 'tips' print set([i.get_distance(Timb) for i in Timb]), 'treelength' print len(Timb), 'tips' # ### Get topology from empirical study of Viburnum with 64 tips # Make tree ultrametric using the penalized likelihood # In[158]: get_ipython().run_line_magic('load_ext', 'rpy2.ipython') # In[159]: get_ipython().run_cell_magic('R', '-w 400 -h 600', 'library(ape)\n\n## make tree ultrametric using penalized likelihood\nVtree <- read.tree("~/Dropbox/RAxML_bestTree.VIB_small_c85d6m4p99")\nUtree <- ladderize(chronopl(Vtree, 0.5))\nUtree <- drop.tip(Utree, "clemensiae_DRY6_PWS_2135")\n\n## multiply bls so tree length=6\nUtree$edge.length <- Utree$edge.length*3\n\n## save the new tree\nwrite.tree(Utree, "Tvib.tre")\nplot(Utree, cex=0.7, edge.width=2)\n#edgelabels(round(Utree$edge.length,3))\n') # In[160]: #### load TVib tree into Python and print newick string Tvib = ete2.Tree("Tvib.tre") get_ipython().system(' cat Tvib.tre') # # Simulate sequence data on each tree # Here I use the _simrrls_ program to simulate RADseq data on each input topology with locus dropout occurring with respect to phylogenetic distances. Find simrrls in my github profile. # In[161]: get_ipython().run_cell_magic('bash', '', '## balanced tree \nmkdir -p simulations/Tbal_full/\nmkdir -p simulations/Tbal_mut/\nmkdir -p simulations/Tbal_cov/\n\n## imbalanced tree\nmkdir -p simulations/Timb_full/\nmkdir -p simulations/Timb_mut/\nmkdir -p simulations/Timb_cov/\n\n## empirical Viburnum tree\nmkdir -p simulations/Tvib_full/\nmkdir -p simulations/Tvib_mut/\nmkdir -p simulations/Tvib_cov/\n') # #### show simrrls options # In[163]: get_ipython().run_cell_magic('bash', '', 'simrrls -h\n') # ### Simulate RAD data on the balanced tree without missing data # In[98]: get_ipython().run_cell_magic('bash', '', 'simrrls -mc 0 -ms 0 -t Tbal.tre \\\n -L 1000 -l 100 \\\n -u 1e-9 -N 5e5 \\\n -f rad -c1 CTGCAG \\\n -o simulations/Tbal_full/Tbal\n') # ### And with missing data from mutation-disruption # In[138]: get_ipython().run_cell_magic('bash', '', 'simrrls -mc 1 -ms 1 -t Tbal.tre \\\n -L 1000 -l 100 \\\n -u 1e-9 -N 5e5 \\\n -f rad -c CTGCAG \\\n -s 300,600 \\\n -o simulations/Tbal_mut/Tbal\n') # ### And with missing data from low sequencing coverage # In[147]: get_ipython().run_cell_magic('bash', '', 'simrrls -D 0 -t Tbal.tre \\\n -L 1000 -l 100 \\\n -u 1e-9 -N 5e5 \\\n -f rad -c CTGCAG \\\n -d 5,5 \\\n -o simulations/Tbal_cov/Tbal\n') # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # ### Assemble data sets in _pyRAD_ (v. 3.1.0) # In[31]: get_ipython().run_cell_magic('bash', '', '## new params file\npyrad -n\n') # In[32]: get_ipython().run_cell_magic('bash', '', "## add phy and nex outputs\nsed -i '/## 1. /c\\simulations/Tbal_full/ ## 1. working dir ' params.txt\nsed -i '/## 2. /c\\simulations/Tbal_full/*.gz ## 2. data loc ' params.txt\nsed -i '/## 3. /c\\simulations/Tbal_full/*barcodes.txt ## 3. Bcode ' params.txt\nsed -i '/## 6. /c\\TGCAG ## 6. Cutter ' params.txt\nsed -i '/## 7. /c\\4 ## 7. Nproc ' params.txt\nsed -i '/## 10. /c\\.82 ## 10. clust thresh' params.txt\nsed -i '/## 11. /c\\rad ## 11. datatype ' params.txt\nsed -i '/## 13. /c\\6 ## 13. maxSH' params.txt\nsed -i '/## 14. /c\\Tbal ## 14. outname' params.txt\nsed -i '/## 16. /c\\ ## 16. addon taxa' params.txt\nsed -i '/## 24./c\\99 ## 24. maxH' params.txt\nsed -i '/## 30./c\\n,p,s ## 30. out format' params.txt\n") # In[99]: get_ipython().run_cell_magic('bash', '', 'pyrad -p params.txt -q\n') # In[139]: get_ipython().run_cell_magic('bash', '', "sed -i '/## 1. /c\\simulations/Tbal_mut ## 1. working dir ' params.txt\nsed -i '/## 2. /c\\simulations/Tbal_mut/*.gz ## 2. data loc ' params.txt\nsed -i '/## 3. /c\\simulations/Tbal_mut/*barcodes.txt ## 3. Bcode ' params.txt\n") # In[140]: get_ipython().run_cell_magic('bash', '', 'pyrad -p params.txt -q\n') # In[148]: get_ipython().run_cell_magic('bash', '', "sed -i '/## 1. /c\\simulations/Tbal_cov ## 1. working dir ' params.txt\nsed -i '/## 2. /c\\simulations/Tbal_cov/*.gz ## 2. data loc ' params.txt\nsed -i '/## 3. /c\\simulations/Tbal_cov/*barcodes.txt ## 3. Bcode ' params.txt\n") # In[149]: get_ipython().run_cell_magic('bash', '', 'pyrad -p params.txt -q\n') # In[ ]: # ### Functions for measuring shared data # In[141]: def getarray(loci, tree): """ parse the loci list and return a presence/absence matrix ordered by the tips on the tree""" ## order (ladderize) the tree tree.ladderize() ## get tip names names = tree.get_leaf_names() ## make empty matrix lxs = np.zeros((len(names), len(loci))) ## fill the matrix for loc in xrange(len(loci)): for seq in loci[loc].split("\n"): if ">" in seq: tname = seq.split()[0][1:-1] lxs[names.index(tname),loc] += 1 return lxs # In[142]: def countmatrix(lxsabove, lxsbelow, max=0): """ fill a matrix with pairwise data sharing between each pair of samples. You could put in two different 'share' matrices to have different results above and below the diagonal. Can enter a max value to limit fill along diagonal. """ share = np.zeros((lxsabove.shape[0], lxsbelow.shape[0])) ## fill above names = range(lxsabove.shape[0]) for row in lxsabove: for samp1,samp2 in itertools.combinations(names,2): shared = lxsabove[samp1, lxsabove[samp2,]>0].sum() share[samp1,samp2] = shared ## fill below for row in lxsbelow: for samp2,samp1 in itertools.combinations(names,2): shared = lxsabove[samp1, lxsabove[samp2,]>0].sum() share[samp1,samp2] = shared ## fill diagonal if not max: for row in range(len(names)): share[row,row] = lxsabove[row,].sum() else: for row in range(len(names)): share[row,row] = max return share # In[143]: locidata = open("simulations/Tbal_mut/outfiles/Tbal.loci") loci = locidata.read().split("|\n")[:-1] lxs = getarray(loci, Tbal) print lxs.shape print lxs # In[144]: share = countmatrix(lxs, lxs) print share.shape print share # ### Plotting function # In[145]: ## Get ordered names of tips on the tree tree = Tbal tree.ladderize() names = tree.get_leaf_names() floater = ["Taxon: %s" % i for i in names] # In[146]: colormap = toyplot.color.LinearMap(toyplot.color.brewer("Spectral"), domain_min=share.min(), domain_max=share.max()) canvas = toyplot.Canvas(width=1200, height=900) table = canvas.matrix(share, colormap=colormap, label="", bounds=(50, 500, 50, 500), step=5) ## make floater for grid for i,j in itertools.product(range(len(share)), repeat=2): table.body.cell(i,j).title='%s, %s : %s' % (names[i], names[j], int(share[i,j])) ## put box around grid table.body.grid.vlines[...,[0,-1]] = 'single' table.body.grid.hlines[[0,-1],...] = 'single' ## remove top and left labels for j in range(share.shape[1]): table.top.cell(0, j).data = "" for i in range(share.shape[0]): table.left.cell(i, 0).data = "" ## canvas for barplot axes = canvas.axes(bounds=(550, 650, 60, 510), label="", xlabel="", ylabel="") ## create barplot axes.bars(share.diagonal()[::-1], along="y", title = floater) ## make floater for barplot zf = zip(names[::-1], share.diagonal()[::-1]) barfloater = ["%s: %s" % (i,int(j)) for i,j in zf] ## Hide yspine, move labels to the left, ## use taxon names, rotate angle, align. axes.y.spine.show = False axes.y.ticks.labels.offset = -5 axes.y.ticks.locator = toyplot.locator.Explicit(range(64), labels=names[::-1]) axes.y.ticks.labels.angle = 0 axes.y.ticks.labels.style = {"baseline-shift":0, "text-anchor":"end", "font-size":"9px"} ## Rotate xlabels, align with ticks, ## change to thousands, move up on canvas, ## show ticks, and hide popup coordinates axes.x.ticks.labels.angle = 90 axes.x.ticks.labels.offset = 20 axes.x.ticks.locator = toyplot.locator.Explicit( [0,200,400,600,800,1000], ["0", "200", "400", "600", "800", "1000"]) axes.x.ticks.labels.style = {"baseline-shift":0, "text-anchor":"end", "-toyplot-anchor-shift":"15px"} axes.x.ticks.show = True axes.coordinates.show = False # In[ ]: