#!/usr/bin/env python
# coding: utf-8

# # DaCosta & Sorenson (2016) single-end dd-RAD data set

# Here we demonstrate a denovo assembly for an empirical RAD data set using the
# ipyrad Python API. This example was run on a workstation with 20 cores and
# takes less than 10 minutes to run completely, but it can also be run on a
# laptop in under an hour.
#
# We will use the Lagonosticta and Vidua data set from
# [DaCosta & Sorenson 2016](https://www.ncbi.nlm.nih.gov/pubmed/26279345).
# This data set is composed of single-end 101bp reads from a ddRAD-seq library
# prepared with the SbfI and EcoRI enzymes, and is available on NCBI under the
# study accession SRP059199. At the end of this notebook we also demonstrate
# the use of ipyrad.analysis tools to run downstream analyses on this data set.
#
# The figure below from this paper shows the general workflow, in which two
# fairly distinct clades were sequenced together but then analyzed separately.

# ## Setup (software and data files)

# If you haven't done so yet, start by installing `ipyrad` using conda (see the
# ipyrad installation instructions) as well as the packages in the cell below.
# This is easiest to do in a terminal. Then open a jupyter-notebook, like this
# one, and follow along with the tutorial by copying and executing the code in
# the cells, and adding your own documentation between them using markdown.
# Feel free to modify parameters to see their effects on the downstream results.

# In[1]:

## conda install ipyrad -c ipyrad
## conda install toytree -c eaton-lab
## conda install entrez-direct -c bioconda
## conda install sratools -c bioconda


# In[8]:

## imports
import ipyrad as ip
import ipyrad.analysis as ipa
import ipyparallel as ipp


# In contrast to the ipyrad CLI, the ipyrad API gives users much more
# fine-scale control over the parallelization of their analysis, but this also
# requires learning a little bit about the library that we use to do this,
# called `ipyparallel`. This library is designed for use with jupyter-notebooks
# to allow massive-scale multi-processing while working *interactively*.
#
# Understanding the nuts and bolts of it might take a little while, but it is
# fairly easy to get started, especially in the way it is integrated with
# ipyrad. To start a parallel client you must run the command-line program
# '`ipcluster`'. This will start a number of independent Python processes
# (kernels) to which we can then send bits of work. The cluster can be stopped
# and restarted independently of this notebook, which is convenient when
# working on an HPC system where connecting to many cores is not always
# immediately possible.
#
# Open a terminal and type the following command to start an ipcluster
# instance with N engines.

# In[9]:

## ipcluster start --n=20


# In[11]:

## connect to cluster
ipyclient = ipp.Client()
ipyclient.ids


# ### Download the data set (Finches)

# These data are archived on the NCBI sequence read archive (SRA) under
# accession id SRP059199. For convenience, the data are also hosted at a
# public Dropbox link, which is a bit easier to access. Run the code below to
# download and decompress the fastq data files, which will be saved into a
# directory called fastqs-Finches/.

# In[3]:

## download the Finches data set from NCBI
sra = ipa.sratools(accession="SRP059199", workdir="fastqs-Finches")
sra.run(force=True, ipyclient=ipyclient)


# ## Create an Assembly object

# This object stores the parameters of the assembly and the organization of
# data files.
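# Before creating the Assembly, it can be useful to confirm that the sratools
# download produced the expected fastq files. The optional check below is a
# minimal sketch using only the standard library; it assumes the
# fastqs-Finches/ workdir set above and gzipped fastq files, which is what the
# sorted_fastq_path parameter in the next cell expects.

# In[ ]:

## optional sanity check: list the downloaded fastq files
## (file names depend on what the SRA run table provides)
import glob
fastqs = sorted(glob.glob("fastqs-Finches/*.fastq.gz"))
print("found {} fastq files".format(len(fastqs)))
print("\n".join(fastqs[:5]))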
# In[4]:

## you must provide a name for the Assembly
data = ip.Assembly("Finches")


# Set parameters for the Assembly. This will raise an error if any of the
# parameters are not allowed because they are the wrong type or out of the
# allowed range.

# In[6]:

## set parameters
data.set_params("project_dir", "analysis-ipyrad/Finches")
data.set_params("sorted_fastq_path", "fastqs-Finches/*.fastq.gz")
data.set_params("datatype", "ddrad")
data.set_params("restriction_overhang", ("CCTGCAGG", "AATTC"))
data.set_params("clust_threshold", "0.85")
data.set_params("filter_adapters", "2")
data.set_params("max_Hs_consens", (5, 5))
data.set_params("output_formats", "psvnkua")

## see/print all parameters
data.get_params()


# ## Assemble the data set

# In[7]:

## run steps 1 & 2 of the assembly
data.run("12")


# In[8]:

## access the stats of the assembly (so far) from the .stats attribute
data.stats


# In[9]:

## run steps 3-5 (within-sample steps) of the assembly
data.run("345")


# ### Branch to create separate data sets for Vidua and Lagonosticta

# In[11]:

## create data set with only Vidua samples + outgroup
subs = [i for i in data.samples if "Vidua" in i] + \
       [i for i in data.samples if "Anomalo" in i]
vidua = data.branch("vidua", subsamples=subs)

## create data set with only Lagonosticta samples + outgroup
subs = [i for i in data.samples if "Lagon" in i] + \
       [i for i in data.samples if "Clyto" in i]
lagon = data.branch("lagon", subsamples=subs)


# ### Assemble each data set through final steps

# Or, if you are pressed for time, you can choose just one of the Assemblies
# going forward. If you do, you may want to choose Vidua since that is the one
# we use in the downstream analysis tools at the end of this notebook.

# In[15]:

vidua.run("6")


# In[16]:

lagon.run("6")


# ### Branch to create several final data sets with different parameter settings

# Here we use nested for-loops to iterate over assemblies and parameter values.

# In[17]:

## iterate over data sets and parameter values
for assembly in [vidua, lagon]:
    for min_sample in [4, 10]:
        ## create new assembly, apply new name and parameters
        newname = "{}_min{}".format(assembly.name, min_sample)
        newdata = assembly.branch(newname)
        newdata.set_params("min_samples_locus", min_sample)

        ## run step 7
        newdata.run("7")


# ### View final stats

# The .stats attribute shows a stats summary for each sample, and a number of
# stats dataframes can be accessed for each step from the .stats_dfs attribute
# of the Assembly.

# In[3]:

vm4 = ip.load_json("analysis-ipyrad/Finches/vidua_min4.json")
vm4.stats


# In[4]:

lm4 = ip.load_json("analysis-ipyrad/Finches/lagon_min4.json")
lm4.stats


# In[5]:

## or read the full stats file as a bash command (cat)
get_ipython().system('cat $vm4.stats_files.s7')


# In[6]:

## the same full stats for lagon
get_ipython().system('cat $lm4.stats_files.s7')


# ## Analysis tools

# There is a lot more information about analysis tools in the ipyrad
# documentation, but here I'll show just a quick example of how you can easily
# access the data files for these assemblies and use them in downstream
# analysis software. The ipyrad analysis tools include convenient wrappers to
# make it easier to parallelize analyses of RAD-seq data. You should still read
# the full tutorial of the software you are using to understand the full scope
# of the parameters involved and their impacts, but once you understand that,
# the ipyrad analysis tools provide an easy way to set up scripts to sample
# different distributions of SNPs and to run many replicates in parallel.
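# The analysis tool wrappers below distribute work over the same ipyparallel
# cluster started at the top of this notebook. If the notebook was restarted in
# between, the optional check below (a minimal sketch using only calls shown
# earlier) confirms the engines are still reachable before launching jobs.

# In[ ]:

## optional: confirm the ipcluster engines are still reachable
## (assumes ipcluster is still running, as started earlier)
import ipyparallel as ipp
ipyclient = ipp.Client()
print("connected to {} engines".format(len(ipyclient.ids)))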
# In[13]:

import ipyrad.analysis as ipa


# In[14]:

## you can re-load assemblies at a later time from their JSON file
min4 = ip.load_json("analysis-ipyrad/Finches/vidua_min4.json")
min10 = ip.load_json("analysis-ipyrad/Finches/vidua_min10.json")


# #### *RAxML* -- ML concatenation tree inference

# In[29]:

## conda install raxml -c bioconda
## conda install toytree -c eaton-lab


# In[48]:

## create a raxml analysis object for the min10 data set
rax = ipa.raxml(
    name=min10.name,
    data=min10.outfiles.phy,
    workdir="analysis-raxml",
    T=20,
    N=100,
    o=[i for i in min10.samples if "Ano" in i],
    )


# In[49]:

## print the raxml command and call it
print(rax.command)
rax.run(force=True)


# In[50]:

## access the resulting tree files
rax.trees


# In[51]:

## plot a tree in the notebook with toytree
import toytree
tre = toytree.tree(rax.trees.bipartitions)
tre.root(wildcard="Ano")
tre.draw(
    width=350,
    height=400,
    node_labels=tre.get_node_values("support"),
    #use_edge_lengths=True,
    );


# ### *tetrad* -- quartet tree inference

# In[39]:

## create a tetrad analysis object
tet = ipa.tetrad(
    name=min4.name,
    seqfile=min4.outfiles.snpsphy,
    mapfile=min4.outfiles.snpsmap,
    nboots=100,
    )


# In[40]:

## run tree inference
tet.run(ipyclient)


# In[41]:

## access tree files
tet.trees


# In[112]:

## plot results (just like above, but unrooted by default)
import toytree
qtre = toytree.tree(tet.trees.nhx)
qtre.root(wildcard="Ano")
qtre.draw(
    width=350,
    height=400,
    node_labels=qtre.get_node_values("support"),
    );


# In[54]:

## draw a cloud-tree to see variation among bootstrap trees
## note that the trees are UNROOTED here, but tips are in the
## same order in all trees.
boots = toytree.multitree(tet.trees.boots, fixed_order=tre.get_tip_labels())
boots.draw_cloudtree(orient='right', edge_style={"opacity": 0.05});


# #### *STRUCTURE* -- population cluster inference

# In[ ]:

## conda install structure clumpp -c ipyrad


# In[56]:

## create a structure analysis object for the no-outgroup data set
struct = ipa.structure(
    name=min10.name,
    data=min10.outfiles.str,
    mapfile=min10.outfiles.snpsmap,
    )

## set params for analysis (should be longer in real analyses)
struct.mainparams.burnin = 1000
struct.mainparams.numreps = 8000


# In[57]:

## run structure across 10 random replicates of sampled unlinked SNPs
for kpop in [2, 4, 6, 8]:
    struct.run(kpop=kpop, nreps=10, ipyclient=ipyclient)


# In[59]:

## wait for all of these jobs to finish
ipyclient.wait()


# #### Use CLUMPP to permute replicates

# In[64]:

## these options make it run faster
struct.clumppparams.m = 3              ## use largegreedy algorithm
struct.clumppparams.greedy_option = 2  ## test nrepeat possible orders
struct.clumppparams.repeats = 10000    ## number of repeats


# In[65]:

## collect results
tables = {}
for kpop in [2, 4, 6, 8]:
    tables[kpop] = struct.get_clumpp_table(kpop)


# #### Plot the results as a barplot

# Usually a next step in a structure analysis would be to do some kind of
# statistical analysis to compare models and identify K values that fit the
# data well.
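# Before plotting, a quick optional check that the permuted tables have the
# expected shape: one row per sample and K ancestry columns, averaged over the
# 10 replicates. This sketch assumes get_clumpp_table returns a pandas
# DataFrame indexed by sample name, which is what the barplot code below
# relies on.

# In[ ]:

## optional: table dimensions for each K value
for kpop in [2, 4, 6, 8]:
    print(kpop, tables[kpop].shape)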
# In[66]:

## order of bars will be taken from the ladderized tree above
myorder = tre.get_tip_labels()


# In[69]:

## import toyplot (packaged with toytree)
import toyplot

## plot bars for each K-value (mean of 10 reps)
for kpop in [2, 4, 6, 8]:
    table = tables[kpop]
    table = table.loc[myorder]

    ## plot barplot w/ hover
    canvas, axes, mark = toyplot.bars(
                            table,
                            title=[[i] for i in table.index.tolist()],
                            width=400,
                            height=200,
                            yshow=False,
                            style={"stroke": toyplot.color.near_black},
                            )


# #### *TREEMIX* -- ML tree & admixture co-inference

# In[70]:

## conda install treemix -c ipyrad


# In[93]:

## group taxa into 'populations'
imap = {
    'orient': ['Vidua_orientalis'],
    'interj': ['Vidua_interjecta'],
    'obtusa': ['Vidua_obtusa'],
    'paradi': ['Vidua_paradisaea'],
    'hypoch': ['Vidua_hypocherina'],
    'macrou': ['Vidua_macroura_macroura', 'Vidua_macroura_arenosa'],
    'fische': ['Vidua_fischeri'],
    'regia' : ['Vidua_regia'],
    'chalyb': ['Vidua_chalybeata_amauropteryx', 'Vidua_chalybeata_neumanni'],
    'purpur': ['Vidua_purpurascens'],
    'rarico': ['Vidua_raricola'],
    #'outgro': ['Anomalospiza_imberbis'],
}

## optional: loci will be filtered if they do not have data for at
## least N samples in each species. Minimums cannot be <1.
minmap = {
    'orient': 1,
    'interj': 1,
    'obtusa': 1,
    'paradi': 1,
    'hypoch': 1,
    'macrou': 2,
    'fische': 1,
    'regia' : 1,
    'chalyb': 2,
    'purpur': 1,
    'rarico': 1,
    #'outgro': 1,
}


# In[104]:

## create a treemix analysis object
tmix = ipa.treemix(
    name=min10.name,
    data=min10.outfiles.snpsphy,
    imap=imap,
    minmap=minmap,
    )

## set params on treemix object
tmix.params.m = 1
tmix.params.root = "interj,orient,paradi,obtusa"
tmix.params.global_ = 1


# In[105]:

## you can simply write the input files and run them externally,
## or, as we show below, use the .run() command to run them here.
tmix.write_output_file()


# In[108]:

## a dictionary for storing treemix objects
tdict = {}

## iterate over replicates and values of m
for rep in range(4):
    for mig in range(4):

        ## create new treemix object copy
        name = "mig-{}-rep-{}".format(mig, rep)
        tmp = tmix.copy(name)

        ## set params on new object
        tmp.params.m = mig

        ## run treemix analysis
        tmp.run()

        ## store the treemix object
        tdict[name] = tmp


# In[110]:

import toyplot
import numpy as np

canvas = toyplot.Canvas(width=800, height=1200)

idx = 0
for mig in range(4):
    for rep in range(4):
        tmp = tdict["mig-{}-rep-{}".format(mig, rep)]
        ax = canvas.cartesian(grid=(4, 4, idx), padding=25, margin=(25, 50, 100, 25))
        ax = tmp.draw(ax)
        idx += 1


# #### *baba* -- D-statistic (ABBA-BABA) admixture inference/hypothesis testing

# In[15]:

## create a baba analysis object
bb = ipa.baba(
    data=min4.outfiles.loci,
    newick="analysis-raxml/RAxML_bestTree.vidua_min10",
    )


# In[16]:

## this will generate tests from the tree, using constraints.
bb.generate_tests_from_tree(
    constraint_exact=False,
    constraint_dict={
        'p4': ['Anomalospiza_imberbis'],
        'p3': ['Vidua_macroura_macroura', 'Vidua_macroura_arenosa'],
    })


# In[17]:

## run inference and significance testing on tests
bb.run(ipyclient=ipyclient)


# In[18]:

## sorted results for the tests performed
bb.results_table.sort_values(by="Z", ascending=False)


# #### Summary

# The top results show the greatest support for admixture between V. macroura
# and V. paradisaea, though the signal is not very strong (Z=2.7).

# In[19]:

## the test that had the most significant result (BABA)
bb.tests[25]


# In[20]:

## next best (ABBA)
bb.tests[22]
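# Since results_table is a pandas DataFrame (it was sorted with .sort_values
# above), it can also be written out for record keeping. The file name below
# is just an example.

# In[ ]:

## optional: save the full D-statistic results table to CSV
## (the output file name is arbitrary)
bb.results_table.to_csv("vidua_dstat_results.csv")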