#!/usr/bin/env python
# coding: utf-8

# # Assembling simulated and empirical data
# 
# This Jupyter notebook provides a completely reproducible record of all the assembly analyses for the ipyrad manuscript. In this notebook we assemble multiple data sets using five different software programs: ipyrad, pyrad, stacks, aftrRAD, and dDocent. We include executable code to download and install each program, to download each data set, and to run each data set through each program.

# ### Jupyter notebook SSH Tunneling
# This notebook was executed on the Yale Farnam cluster with access to 40 compute cores on a single node. We performed local I/O in the notebook using SSH tunneling as described in the ipyrad documentation (http://ipyrad.readthedocs.io/HPC_Tunnel.html).
# 
# ### Organize the directory structure

# In[1]:

import shutil
import glob
import sys
import os

# In[2]:

## Set the scratch dir for data analysis
WORK_DIR = "/ysm-gpfs/scratch60/de243/manuscript-analysis"
if not os.path.exists(WORK_DIR):
    os.makedirs(WORK_DIR)

## Set a local dir for new (locally) installed software
SOFTWARE = os.path.realpath("./tmp-software")
if not os.path.exists(SOFTWARE):
    os.makedirs(SOFTWARE)

## Make paths to data (we download data into these dirs later)
EMPIRICAL_DATA_DIR = os.path.join(WORK_DIR, "empirical_data")
if not os.path.exists(EMPIRICAL_DATA_DIR):
    os.makedirs(EMPIRICAL_DATA_DIR)

SIMULATED_DATA_DIR = os.path.join(WORK_DIR, "simulated_data")
if not os.path.exists(SIMULATED_DATA_DIR):
    os.makedirs(SIMULATED_DATA_DIR)

# In[3]:

## create dirs for results from each software
PACKAGES = ["ipyrad", "pyrad", "stacks", "ddocent"]
rdirs = {}
for pack in PACKAGES:
    rdir = os.path.join(WORK_DIR, "assembly-{}".format(pack))
    if not os.path.exists(rdir):
        os.makedirs(rdir)
    rdirs[pack] = rdir

## create subdirectories for results
for key, val in rdirs.iteritems():
    ## make for sim and real data
    real = os.path.join(val, "REALDATA")
    sim = os.path.join(val, "SIMDATA")
    for idir in [real, sim]:
        if not os.path.exists(idir):
            os.makedirs(idir)

    ## extra subdirectories for stacks
    if key == "stacks":
        extras = [os.path.join(real, "gapped"),
                  os.path.join(real, "ungapped"),
                  os.path.join(real, "default")]
        for idir in extras:
            if not os.path.exists(idir):
                os.makedirs(idir)

# In[4]:

## create simulated data directories
SIMDATA = ["simno", "simlo", "simhi", "simla"]
sdirs = {}
for simd in SIMDATA:
    sdir = os.path.join(SIMULATED_DATA_DIR, simd)
    if not os.path.exists(sdir):
        os.makedirs(sdir)
    sdirs[simd] = sdir

# ### Download the empirical RAD *Pedicularis* data set
# 
# This is a RAD data set for 13 taxa from Eaton and Ree (2013) [open access link](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3739883/pdf/syt032.pdf). Here we grab the demultiplexed fastq data from a public Dropbox link, but the same data is also hosted on the NCBI SRA as SRP021469.

# In[12]:

get_ipython().run_cell_magic('bash', '-s $EMPIRICAL_DATA_DIR --err stderr --out stdout', '\n## the location of the data \ndataloc="https://dl.dropboxusercontent.com/u/2538935/example_empirical_rad.tar.gz"\n\n## grab data from the public link\ncurl -LkO $dataloc \ntar -xvzf example_empirical_rad.tar.gz\nrm example_empirical_rad.tar.gz\n\n## move data to the EMPIRICAL_DATA_DIR\nmv ./example_empirical_rad/ $1/Pedicularis\n')
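# The Dropbox link above is a convenience copy of the demultiplexed fastq data. If it ever becomes unavailable, the same data can be fetched from the NCBI SRA study SRP021469 instead. The commented cell below is only a sketch and was not run here: it assumes sra-tools (fastq-dump) is installed, and the SRRXXXXXXX values are placeholders for the run accessions of the study.

# In[ ]:

# %%bash -s $EMPIRICAL_DATA_DIR
# ## placeholder run accessions for SRP021469; look these up on the SRA Run Selector
# for srr in SRRXXXXXXX SRRXXXXXXX; do
#     fastq-dump --gzip --outdir $1/Pedicularis $srr
# done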
# ### Download the empirical *Heliconius* reference genome
# Davey, John W., et al. "Major improvements to the Heliconius melpomene genome assembly used to confirm 10 chromosome fusion events in 6 million years of butterfly evolution." G3: Genes|Genomes|Genetics 6.3 (2016): 695-708.

# In[11]:

get_ipython().run_cell_magic('bash', '-s $EMPIRICAL_DATA_DIR --err stderr --out stdout', '\n## location of the data\ndataloc="http://butterflygenome.org/sites/default/files/Hmel2-0_Release_20151013.tgz"\n\n## grab data from the public link \ncurl -LkO $dataloc\ntar -zxvf Hmel2-0_Release_20151013.tgz\nrm Hmel2-0_Release_20151013.tgz\n\n## move data to EMPIRICAL_DATA_DIR\nmv ./Hmel2/ $1/Heliconius\n')

# # Install all the software

# ### First install a new isolated miniconda directory
# 
# We could create a separate environment in an existing conda installation, but this is simpler since it works for users whether or not they already have conda installed, and we can easily remove all of the software when we are done by simply deleting the 'tmp-software' directory. So all of the work we do in this notebook is done in a completely isolated software environment.

# In[13]:

get_ipython().run_cell_magic('bash', '-s $SOFTWARE --err stderr --out stdout', '\n## Fetch the latest miniconda \nminiconda="https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh"\ncurl -LkO $miniconda\n\n## Install the new miniconda silently into the \'software\' directory.\n## -b = batch mode; -f = force overwrite; -p = install dir\nbash Miniconda-latest-Linux-x86_64.sh -b -f -p $1\n\n## update conda, if this fails your other conda installation may be \n## getting in the way. Try clearing your ~/.condarc file.\n$1/bin/conda update -y conda\n\n## remove the install file\nrm Miniconda-latest-Linux-x86_64.sh \n')

# In[23]:

get_ipython().run_cell_magic('bash', '-s $SOFTWARE', '## check installation\n$1/bin/conda --version\n')

# ### Install ipyrad (Eaton & Overcast 2017)

# In[18]:

get_ipython().run_cell_magic('bash', '-s $SOFTWARE --err stderr --out stdout', '## installs the latest release (silently)\n$1/bin/conda install -y -c ipyrad ipyrad\n')

# In[22]:

get_ipython().run_cell_magic('bash', '-s $SOFTWARE', '## check installation\n$1/bin/ipyrad -v\n')

# ### Install stacks (Catchen)

# In[25]:

get_ipython().run_cell_magic('bash', '-s $SOFTWARE --err stderr --out stdout', '## installs the latest release (silently)\n$1/bin/conda install -y -c bioconda stacks\n')

# In[28]:

get_ipython().run_cell_magic('bash', '-s $SOFTWARE', '## check installation\n$1/bin/ustacks -v\n')

# ### Install ddocent (Puritz)

# In[29]:

get_ipython().run_cell_magic('bash', '-s $SOFTWARE --err stderr --out stdout', '## install all of the ddocent reqs\n$1/bin/conda install -y -c conda-forge unzip \n$1/bin/conda install -y -c bioconda ddocent=2.2.4 \n')

# In[42]:

get_ipython().run_cell_magic('bash', '-s $SOFTWARE --err stderr', '## when running dDocent you have to always set the tmp-software\n## directory to the top of your PATH\nexport PATH="$1/bin:$PATH"\n\n## check installation (no -v argument, print start of stdout)\n$1/bin/dDocent | head -n 1\n')

# ### Install pyrad (Eaton 2014)

# In[49]:

get_ipython().run_cell_magic('bash', '-s $SOFTWARE --err stderr --out stdout', '\n## Should be unnecessary because numpy and scipy already installed\n$1/bin/conda install -y numpy scipy\n\n## get muscle binary\nmuscle="http://www.drive5.com/muscle/downloads3.8.31/muscle3.8.31_i86linux64.tar.gz"\ncurl -LkO $muscle \ntar -xvzf muscle*.tar.gz \nmv muscle3.8.31_i86linux64 $1/bin/muscle\nrm muscle3.8.31_i86linux64.tar.gz\n\n## Download and install vsearch\nvsearch="https://github.com/torognes/vsearch/releases/download/v2.0.3/vsearch-2.0.3-linux-x86_64.tar.gz"\ncurl -LkO $vsearch\ntar xzf vsearch-2.0.3-linux-x86_64.tar.gz\nmv vsearch-2.0.3-linux-x86_64/bin/vsearch $1/bin/vsearch\nrm -r vsearch-2.0.3-linux-x86_64/ vsearch-2.0.3-linux-x86_64.tar.gz\n\n## Fetch pyrad source from git repository \nif [ ! -d ./pyrad-git ]; then\n git clone https://github.com/dereneaton/pyrad.git pyrad-git\nfi;\n\n## and install to conda using pip\ncd ./pyrad-git\n$1/bin/pip install -e . \ncd ..\n')

# In[51]:

get_ipython().run_cell_magic('bash', '-s $SOFTWARE', '## check installation\n$1/bin/pyrad --version\n')
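# Because every program is installed into this single throwaway miniconda directory, recording the exact package versions used is a one-liner. The cell below is optional (it was not part of the original analysis, and the output file name is arbitrary); it writes a version snapshot of everything conda has installed into the isolated environment so far.

# In[ ]:

get_ipython().run_cell_magic('bash', '-s $SOFTWARE', '## snapshot the exact package versions in the isolated environment\n$1/bin/conda list --export > software-versions.txt\nhead software-versions.txt\n')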
# # Simulate data
# We will use simrrls to generate simulated RAD-seq data for testing. This is a program written by Deren Eaton and available on github: github.com/dereneaton/simrrls.git. simrrls requires the python egglib module, which is a pain to install in full, but we only need the simulation aspects of it, which are fairly easy to install. See below.

# In[57]:

get_ipython().run_cell_magic('bash', '-s $SOFTWARE --err stderr --out stdout', '\n## install gsl\n$1/bin/conda install gsl -y \n\n## fetch egglib cpp and mv into tmp-software/pkgs\neggcpp="http://mycor.nancy.inra.fr/egglib/releases/2.1.11/egglib-cpp-2.1.11.tar.gz"\ncurl -LkO $eggcpp\ntar -xzvf egglib-cpp-2.1.11.tar.gz \nmv egglib-cpp-2.1.11/ $1/pkgs/\n\n## install egglib-cpp\ncd $1/pkgs/egglib-cpp-2.1.11/\nsh ./configure --prefix=$1\nmake\nmake install \ncd -\n\n## fetch egglib py module and mv into tmp-software/pkgs\neggpy="http://mycor.nancy.inra.fr/egglib/releases/2.1.11/egglib-py-2.1.11.tar.gz"\ncurl -LkO $eggpy \ntar -xvzf egglib-py-2.1.11.tar.gz\nmv egglib-py-2.1.11/ $1/pkgs/\n\n## install egglib-py\ncd $1/pkgs/egglib-py-2.1.11/\n$1/bin/python setup.py build --prefix=$1\n$1/bin/python setup.py install --prefix=$1\ncd -\n\n## clone simrrls into tmp-software/pkgs/\nif [ ! -d $1/pkgs/simrrls ]; then\n git clone https://github.com/dereneaton/simrrls.git $1/pkgs/simrrls\nfi\n\n## install with tmp-software/bin/pip\ncd $1/pkgs/simrrls\n$1/bin/pip install -e .\ncd -\n')

# In[68]:

get_ipython().run_cell_magic('bash', '-s $SOFTWARE', "## check installations\necho 'egglib' $($1/bin/egglib | grep Version)\n$1/bin/simrrls --version\n")

# ### Simulating different RAD-datasets
# 
# Both pyRAD and stacks have undergone substantial development since the original pyrad analysis, so we want to test the performance of all the current pipelines and compare current to past performance. We follow the original pyRAD manuscript analysis (Eaton 2013) by simulating modest-sized datasets with variable amounts of indels, and we also simulate one much larger dataset. In addition, because stacks has since added an option for gapped analysis, we test both gapped and ungapped assemblies.
# 
# ### Tuning the simrrls indel parameter
# 
# The -I parameter of simrrls has changed since the initial pyrad manuscript, so we had to explore new values that approximate the number of indels we are after. A quick way to gauge the indel variation produced by different parameter values is to run simrrls and pipe the output through muscle, which shows how many indel-bearing sequences are generated (see the commented commands below).

# In[12]:

# simrrls -n 1 -L 1 -I 1 -r1 $RANDOM 2>&1 | \
#     grep 0 -A 1 | \
#     tr '@' '>' | \
#     muscle | grep T | head -n 60

# In[13]:

# for i in {1..50};
# do simrrls -n 1 -L 1 -I .05 -r1 $RANDOM 2>&1 | \
#     grep 0 -A 1 | \
#     tr '@' '>' | \
#     muscle | \
#     grep T | \
#     head -n 40 >> rpt.txt;
# done
# grep "-" rpt.txt | wc -l

# From experimentation:
# 
#     -I value -- %loci w/ indels
# 
#     0.02 -- ~10%
#     0.05 -- ~15%
#     0.10 -- ~25%
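# The commented helper below is only a convenience sketch and was not part of the original analysis: it reads the same rpt.txt file written by the loop above and reports the gap-bearing sequence lines as a fraction of the total, which makes different -I values easier to compare side by side.

# In[ ]:

# def frac_gapped(path="rpt.txt"):
#     """ fraction of aligned sequence lines in rpt.txt that contain a gap """
#     with open(path) as infile:
#         seqs = [line.strip() for line in infile if line.strip()]
#     gapped = sum(1 for seq in seqs if "-" in seq)
#     return gapped, len(seqs), gapped / float(len(seqs))
# 
# print frac_gapped()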
# The simulated data will live in these directories:
# 
#     SIM_NO_DIR    = WORK_DIR/simulated_data/simno
#     SIM_LO_DIR    = WORK_DIR/simulated_data/simlo
#     SIM_HI_DIR    = WORK_DIR/simulated_data/simhi
#     SIM_LARGE_DIR = WORK_DIR/simulated_data/simla
# 
# Timing:
# 
#     10K loci  -- ~8MB  -- ~2 minutes
#     100K loci -- ~80MB -- ~20 minutes

# ### Call *simrrls* to simulate data

# In[70]:

get_ipython().run_cell_magic('bash', '-s $SOFTWARE $SIMULATED_DATA_DIR', '\n## call simrrls (No INDELS)\n$1/bin/simrrls -o $2/simno/simno -ds 2 -L 10000 -I 0 \n\n## call simrrls (Low INDELS)\n$1/bin/simrrls -o $2/simlo/simlo -ds 2 -L 10000 -I 0.02\n\n## call simrrls (High INDELS) \n$1/bin/simrrls -o $2/simhi/simhi -ds 2 -L 10000 -I 0.05\n\n## call simrrls on Large data set (this takes a few hours, sorry!)\n## (30x12=360 Individuals at 100K loci) = gzipped 2.2GB\n$1/bin/simrrls -o $2/simla/Big_i360_L100K -ds 0 -L 100000 -n 30\n')

# ### Demultiplex all four libraries
# 
# We will start each analysis from the demultiplexed data files, since this is commonly what is available when data are downloaded from NCBI. We use ipyrad to demultiplex the data.

# In[ ]:

import ipyrad as ip

for name in ["simno", "simlo", "simhi", "simla"]:
    ## demultiplex the library
    data = ip.Assembly(name, quiet=True)
    pdir = os.path.join(SIMULATED_DATA_DIR, name)
    data.set_params("project_dir", pdir)
    data.set_params("raw_fastq_path", os.path.join(pdir, "*.gz"))
    data.set_params("barcodes_path", os.path.join(pdir, "*barcodes.txt"))
    data.run("1")
    print "demultiplexed fastq data written to: {}".format(data.dirs.fastqs)

# In[ ]:

print "demultiplexed fastq data written to: {}".format(data.dirs.fastqs)
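# A quick sanity check on the demultiplexing step (not part of the original pipeline). ipyrad saves each Assembly to a JSON file in its project_dir, so we can reload each library and print the per-sample stats to confirm that every sample received reads. The path below assumes ipyrad's default naming of [project_dir]/[name].json.

# In[ ]:

for name in ["simno", "simlo", "simhi", "simla"]:
    pdir = os.path.join(SIMULATED_DATA_DIR, name)
    ## reload the saved Assembly and show raw read counts per sample
    data = ip.load_json(os.path.join(pdir, name + ".json"))
    print name
    print data.stats.head()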
# ## Assemble simulated data sets

# ### ipyrad: simulated data assembly
# 
# Results:
# 
#     Data set   Cores   Loci     Time
#     simno      4       10000    13:58
#     simlo      4       10000    15:17
#     simhi      4       10000    13:58
#     simla      4       100000   13:38
#     simla      80      100000   ...

# In[ ]:

## A function to run ipyrad on a given data set
def run_ipyrad(name):
    ## create Assembly
    data = ip.Assembly(name)

    ## set data paths and params
    data.set_params("project_dir", os.path.join(SIMULATED_DATA_DIR, name))
    data.set_params("sorted_fastq_path",
                    os.path.join(SIMULATED_DATA_DIR, name, name + "_fastqs", "*.gz"))
    data.set_params('max_low_qual_bases', 4)
    data.set_params('filter_min_trim_len', 69)
    data.set_params('max_Ns_consens', (99, 99))
    data.set_params('max_Hs_consens', (99, 99))
    data.set_params('max_SNPs_locus', (100, 100))
    data.set_params('min_samples_locus', 2)
    data.set_params('max_Indels_locus', (99, 99))
    data.set_params('max_shared_Hs_locus', 99)

    ## run ipyrad steps 1-7
    data.run("1234567", show_cluster=True)
    return data

# In[ ]:

get_ipython().run_cell_magic('timeit', '-n1 -r1', '## this records time to run the code in this cell once\nrun_ipyrad("simno")\n')

# In[ ]:

get_ipython().run_cell_magic('timeit', '-n1 -r1', '## this records time to run the code in this cell once\nrun_ipyrad("simlo")\n')

# In[ ]:

get_ipython().run_cell_magic('timeit', '-n1 -r1', '## this records time to run the code in this cell once\nrun_ipyrad("simhi")\n')

# In[ ]:

get_ipython().run_cell_magic('timeit', '-n1 -r1', '## this records time to run the code in this cell once\nrun_ipyrad("simla")\n')

# ### stacks: simulated data assembly

# In[ ]: