#!/usr/bin/env python
# coding: utf-8

# # Analysing simulated and empirical datasets with stacks, pyrad, and ipyrad
#
# This ipython notebook will provide a completely reproducible record of
# all the analyses for the ipyrad manuscript.
#
# All analyses were performed on a 40 core Intel(R) Xeon(R) CPU E5-2680 v2 @ 2.80GHz
# Silicon Mechanics compute node with 120GB of main memory running CentOS Linux release 7.1.1503.
#
# * Insert thanks to the CCNY scientific computing cluster folks here
#
#
# ### Notebook Tunnels
# Mostly notes for myself, since my cluster config is complicated. To get a connection to a
# notebook on a compute node I have to bounce off 2 different machines: a gateway to get on
# campus, then another gateway (head hpc node) so I can talk to the cluster. The ssh commands
# will spawn processes in the background to shuttle communication between the ports. Kinda fancy.
#
# On the cluster compute node (where you actually want to do the work) run:
#
#     ipython notebook --no-browser --port=8888 &
#
# On the cluster head node run this to forward connections from the campus to a node inside the hpc infrastructure:
#
#     ssh -N -f -L localhost:8888:localhost:8888 <user>@<compute_node>
#
# On your gateway box run:
#
#     ssh -N -f -L localhost:8887:localhost:8888 <user>@<hpc_head_node>
#
#
# ## Prerequisites (download and install data and executables)

# In[1]:

import shutil
import glob
import sys
import os

## Set the default directories for exec and data.
WORK_DIR="/home/iovercast/manuscript-analysis/"
EMPIRICAL_DATA_DIR=os.path.join(WORK_DIR, "example_empirical_rad/")
SIMULATION_DATA_DIR=os.path.join(WORK_DIR, "simulated_data")
IPYRAD_DIR=os.path.join(WORK_DIR, "ipyrad/")
PYRAD_DIR=os.path.join(WORK_DIR, "pyrad/")
STACKS_DIR=os.path.join(WORK_DIR, "stacks/")
AFTRRAD_DIR=os.path.join(WORK_DIR, "aftrRAD/")
DDOCENT_DIR=os.path.join(WORK_DIR, "dDocent/")

## (empirical data dir will be created for us when we untar it)
for dir in [WORK_DIR, IPYRAD_DIR, PYRAD_DIR, STACKS_DIR, AFTRRAD_DIR, DDOCENT_DIR]:
    if not os.path.exists(dir):
        os.makedirs(dir)

## Empirical output directories
IPYRAD_OUTPUT=os.path.join(IPYRAD_DIR, "REALDATA/")
PYRAD_OUTPUT=os.path.join(PYRAD_DIR, "REALDATA/")
STACKS_OUTPUT=os.path.join(STACKS_DIR, "REALDATA/")
STACKS_GAP_OUT=os.path.join(STACKS_OUTPUT, "gapped/")
STACKS_UNGAP_OUT=os.path.join(STACKS_OUTPUT, "ungapped/")
STACKS_DEFAULT_OUT=os.path.join(STACKS_OUTPUT, "default/")
AFTRRAD_OUTPUT=os.path.join(AFTRRAD_DIR, "REALDATA")
DDOCENT_OUTPUT=os.path.join(DDOCENT_DIR, "REALDATA")

## Make the empirical output directories if they don't already exist
for dir in [IPYRAD_OUTPUT, PYRAD_OUTPUT, STACKS_OUTPUT,\
            STACKS_GAP_OUT, STACKS_UNGAP_OUT, STACKS_DEFAULT_OUT, AFTRRAD_OUTPUT, DDOCENT_OUTPUT]:
    if not os.path.exists(dir):
        os.makedirs(dir)

os.chdir(WORK_DIR)

# In[4]:

### Fetch the pedicularis data
## curl grabs the data from a public dropbox url
## the curl command uses an upper-case o argument, not a zero.
get_ipython().system('curl -LkO https://dl.dropboxusercontent.com/u/2538935/example_empirical_rad.tar.gz')
## the tar command decompresses the data directory
get_ipython().system('tar -xvzf example_empirical_rad.tar.gz')

# In[195]:

## The original Heliconius analysis from pyrad v3
## Fetch the heliconius genome and rad data
##
## Davey, John W., et al. "Major improvements to the Heliconius melpomene
## genome assembly used to confirm 10 chromosome fusion events in 6
## million years of butterfly evolution." G3: Genes|Genomes|Genetics 6.3
## (2016): 695-708.
get_ipython().system('curl -LkO http://butterflygenome.org/sites/default/files/Hmel2-0_Release_20160201.tgz') get_ipython().system('tar -xvzf Hmel2-0_Release_20160201.tgz') ## Several RAD datasets are available ## ## Heliconius Genome Consortium. (2012). Butterfly genome reveals ## promiscuous exchange of mimicry adaptations among species. ## Nature, 487(7405), 94-98. ## European Nucleotide Archive, Accession ERP000991 ## Heliconius melpomene melpomene x Heliconius melpomene rosina - ERP000993 ## And from the Davey et al 2016 ## European Nucleotide Archive (ENA), accession PRJEB11288 # ## Prereqs - Install ipyrad # # Full install details for all platforms are here: http://ipyrad.readthedocs.io/installation.html # In[69]: get_ipython().run_cell_magic('bash', '-s "$WORK_DIR"', '## Must always export the new miniconda path in each bash cell. We do this\n## so the analysis pipeline is totally self contained, it doesn\'t read to\n## or write from anywwhere but the WORK_DIR.\n## There\'s probably a magic way to do this but i can\'t figure it out\nexport PATH="$1/miniconda/bin:$PATH"; export "WORK_DIR=$1"\n\n## Fetch the latest miniconda installer\nwget --quiet https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh\n\n## Install miniconda silently to the work directory.\n## -b means "batch" mode, -f is force overwrite and -p is the install dir\nbash Miniconda-latest-Linux-x86_64.sh -b -f -p $WORK_DIR/miniconda\n\nconda update -y conda ## updates conda\nconda install -y -c ipyrad ipyrad ## installs the latest release (silently `-y`)\n') # In[70]: get_ipython().run_cell_magic('bash', '-s "$WORK_DIR"', 'export PATH="$1/miniconda/bin:$PATH"\nwhich ipyrad\n') # ## Prereqs - Install pyrad # # In[71]: get_ipython().run_cell_magic('bash', '-s "$WORK_DIR"', 'export PATH="$1/miniconda/bin:$PATH"; export "WORK_DIR=$1"\n\n## Fetch and install prerequisites. This takes a few minutes so be patient\n\n## Should be unnecessary because numpy and scipy already installed by conda\nconda install numpy scipy\nwget http://www.drive5.com/muscle/downloads3.8.31/muscle3.8.31_i86linux64.tar.gz\ntar -xvzf muscle*.tar.gz\n\n## This unpacks the muscle binary in the current working directory\n## You can test it to see if it runs:\n./muscle3.8.31_i86linux64 -h\n\n## Copy to miniconda/bin so it will be in your path\n## Maybe not technically \'correct\' but it\'ll work for our purposes\ncp muscle3.8.31_i86linux64 $WORK_DIR/miniconda/bin/muscle\n\n## Download and install vsearch\nwget https://github.com/torognes/vsearch/releases/download/v2.0.3/vsearch-2.0.3-linux-x86_64.tar.gz\ntar xzf vsearch-2.0.3-linux-x86_64.tar.gz\ncp vsearch-2.0.3-linux-x86_64/bin/vsearch $WORK_DIR/miniconda/bin/vsearch\n\n## Fetch pyrad source from the git repository\ngit clone https://github.com/dereneaton/pyrad.git\ncd pyrad\n\n## If you are using anaconda you can simply run setup.py to\n## install pyrad into your conda environment\npython setup.py install\n\nwhich muscle\nwhich vsearch\nwhich pyrad\n') # ### Prereqs - Download and install stacks # # **NB:** Stacks is trickier to build on OSX, I had to compile it on a different # machine and then scp the binaries to the box I was working on. # # Stacks is picky about where stuff installs to. 
If you don't have permission to install to /usr/local (most HPC systems) then you need to provide the `--prefix` argument to `./configure` # In[74]: get_ipython().run_cell_magic('bash', '-s "$WORK_DIR"', 'export PATH="$1/miniconda/bin:$PATH"; export "WORK_DIR=$1"\ncd $WORK_DIR\n\n## 1.42 is the version that crashed during cstacks on the simdata. Julian fixed\n## the bug and pushed 1.43 which seems to work great.\n#wget http://catchenlab.life.illinois.edu/stacks/source/stacks-1.42.tar.gz\nwget http://catchenlab.life.illinois.edu/stacks/source/stacks-1.43.tar.gz\ntar -xvzf stacks-1.43.tar.gz\ncd stacks-1.43\n./configure --prefix=$WORK_DIR/miniconda\nmake\nmake install\n\ncd $WORK_DIR\nwhich process_radtags\n') # ### Prereqs - Download and install aftrRAD # # Lots of gnarly prereqs. Also, the `Genotype.pl` script prompts the user a couple times # to verify whether or not you want to remove individuals with lots of missing data. I # modified the script for the simulated data so it just passes so I can actually # automate testing. # In[338]: get_ipython().run_cell_magic('bash', '-s "$WORK_DIR" -s "$AFTRRAD_DIR"', 'export PATH="$1/miniconda/bin:$PATH"; export "WORK_DIR=$1"; export "AFTRRAD_DIR=$2"\ncd $AFTRRAD_DIR\n\n## Get a copy of acana\nwget http://www.niehs.nih.gov/research/resources/assets/docs/acana_linux_x64tgz.tgz\ncp ACANA_DIR/ACANA .\ncp ACANA_DIR/dnaMatrix .\n\n## ... and mafft\nwget http://mafft.cbrc.jp/alignment/software/mafft-7.305-with-extensions-src.tgz\ntar -xvzf mafft-7.305-with-extensions-src.tgz\ncd mafft-7.305-with-extensions/core\n## This won\'t work on mac (`-i` flag acts different)\nsed -i \'s/PREFIX = \\/usr\\/local/PREFIX = \\/$WORK_DIR\\/miniconda/\' Makefile\nmake; make install\n\n## ... and R\n## R turns out to be a complete nightmare to install if you don\'t have root\n## using brew makes it easier, but still not v straightforward\n## Brew installing R takes a while so be patient (>10 minutes)\ncd $AFTRRAD_DIR\ngit clone https://github.com/Linuxbrew/brew.git\n./brew/bin/brew install homebrew/science/r\n## Copy the R binary to the miniconda bin dir so it will be available\ncp brew/bin/R $WORK_DIR/miniconda/bin/\n\n## Some hackish bullshit to get cpan working in the event you are _not_ root\n## which is most everybody\nwget -O- http://cpanmin.us | perl - -l ~/perl5 App::cpanminus local::lib\neval `perl -I ~/perl5/lib/perl5 -Mlocal::lib`\necho \'eval `perl -I $WORK_DIR/perl5/lib/perl5 -Mlocal::lib`\' >> $WORK_DIR/.profile\necho \'export MANPATH=$WORK_DIR/perl5/man:$MANPATH\' >> $WORK_DIR/.profile\nsource $WORK_DIR/.profile\ncpanm Parallel::ForkManager\n\n## Actually get aftrRAD\ngit clone https://github.com/mikesovic/AftrRAD.git\n') # ### Install dDocent # # dDocent installs a bunch of business into its working directory so you have to be # sure to update the PATH in a similar way to how we do it with miniconda, i.e.: # # %%bash -s "$WORK_DIR" -s "$DDOCENT_DIR" # export PATH="$2/:$1/miniconda/bin:$PATH"; # # From the dDocent docs: "Now if you are using a Mac computer, things get a little trickier." This part of the manu pipeline is only tested and known to work on linux. # # There is an install shell script and it tried to install lots of the deps, but several failed, which i had to install by hand (freebayes, bwa, samtools, seqtk, cd-hit-est). Drag. Freebayes turned out to be a nightmare of deps as well, so it took hours to figure out how to install. Ultimately I had to compile it on another machine and copy the binaries over. 
Also had to update my LD_LIBRARY_PATH to get vcflibs tools to work: # # export LD_LIBRARY_PATH=/home/iovercast/manuscript-analysis/dDocent/freebayes-src/vcflib/tabixpp/htslib/:$LD_LIBRARY_PATH # In[435]: #%%bash -s "$WORK_DIR" -s "$DDOCENT_DIR" #export PATH="$1/miniconda/bin:$PATH"; export "WORK_DIR=$1"; export "DDOCENT_DIR=$2" os.chdir(WORK_DIR) #force = True force = "" if force: shutil.rmtree(DDOCENT_DIR) cmd = "git clone https://github.com/jpuritz/dDocent.git" get_ipython().system('$cmd') os.chdir("dDocent") ## This will run for a long time and install a bunch of binaries to the dDocent dir cmd = "sh install_dDocent_requirements " + DDOCENT_DIR get_ipython().system('$cmd') cmd = "chmod 777 {}/dDocent".format(DDOCENT_DIR) ## test cmd = "export PATH={}:$PATH; dDocent".format(DDOCENT_DIR) get_ipython().system('$cmd') # # Simulation Analysis # # We will use `simrrls` to generate some simulated RAD-seq data for testing. This is a program that was written by Deren Eaton and is available on github: github.com/dereneaton/simrrls.git. `simrrls` requires the python egglib module, which is somewhat painful to install. Short version: # * Install egglib (painful) # * Good directions: http://wjidea.github.io/2016/installEgglib.html # * Install simrrls # * git clone https://github.com/dereneaton/simrrls.git # * cd simrrls # * python setup.py install # In[ ]: get_ipython().run_cell_magic('bash', '-s "$WORK_DIR"', 'export PATH="$1/miniconda/bin:$PATH"; export "WORK_DIR=$1"\ncd $WORK_DIR\n\n## This is the long and painful version of installing egglib.\n## This is what i did, it\'s not super guaranteed to work on \n## different platforms, but it should be at least a guide to follow.\n\n## no cmake on my hpc compute node, so we have to install it ourselves\n## grab the silent installer and tell it to write into the \n## miniconda dir, since we\'re already adding that to path\nwget https://cmake.org/files/v3.6/cmake-3.6.1-Linux-x86_64.sh\nbash cmake-3.6.1-Linux-x86_64.sh --prefix=$WORK_DIR/miniconda --exclude-subdir\ncmake --version\n\n## Install bio++. 
Similar move here, we\'ll install to the miniconda\n## directory because we don\'t have any assurance of root on the box\nmkdir bpp\ncd bpp\nwget http://biopp.univ-montp2.fr/repos/sources/bpp-core-2.2.0.tar.gz\nwget http://biopp.univ-montp2.fr/repos/sources/bpp-seq-2.2.0.tar.gz\nwget http://biopp.univ-montp2.fr/repos/sources/bpp-popgen-2.2.0.tar.gz\ntar -xvzf bpp-core-2.2.0.tar.gz\ntar -xvzf bpp-seq-2.2.0.tar.gz\ntar -xvzf bpp-popgen-2.2.0.tar.gz\ncd bpp-core-2.2.0\ncmake -DCMAKE_INSTALL_PREFIX=$WORK_DIR/miniconda\nmake\nmake install\ncd ../bpp-seq-2.2.0\ncmake -DCMAKE_INSTALL_PREFIX=$WORK_DIR/miniconda\nmake\nmake install\ncd ../bpp-popgen-2.2.0\ncmake -DCMAKE_INSTALL_PREFIX=$WORK_DIR/miniconda\nmake\nmake install\n\n## install egglib-cpp and egglib python module\n## NB: simrrls requires egglib 2 \nwget http://mycor.nancy.inra.fr/egglib/releases/2.1.11/egglib-cpp-2.1.11.tar.gz\nwget http://mycor.nancy.inra.fr/egglib/releases/2.1.11/egglib-py-2.1.11.tar.gz\ntar -xvzf egglib-cpp-2.1.11.tar.gz\ntar -xvzf egglib-py-2.1.11.tar.gz\ncd egglib-cpp-2.1.11\n./configure --prefix=$WORK_DIR/miniconda\nmake\nmake install\ncd ../egglib-py-2.1.11/\npython setup.py build --prefix=$WORK_DIR/miniconda\npython setup.py install --prefix=$WORK_DIR/miniconda\n\n## Install simrrls\ngit clone https://github.com/dereneaton/simrrls.git\ncd simrrls\npython setup.py install\nsimrrls --help\n') # ## Simulating different RAD-datasets # Both pyRAD and stacks have undergone a lot of work since the original pyrad analysis. Because improvements have been made we want to test performance of all the current pipelines and be able to compare current to past performance. We'll follow the original pyRAD manuscript analysis (Eaton 2013) by simulating modest sized datasets with variable amounts of indels. We'll also simulate one much larger dataset. Also, because stacks has since included an option for handling gapped analysis we'll test both gapped and ungapped assembly. # # ### Tuning simrrls indel parameter # The `-I` parameter for simrrls has changed since the initial pyrad manuscript, so the we had to explore new values for this parameter that will approximate the number of indels we are after. 
I figured out a way to run simrrls and pipe the output to muscle to get a quick idea of the indel variation for different params: # # simrrls -n 1 -L 1 -I 1 -r1 $RANDOM 2>&1 | grep 0 -A 1 | tr '@' '>' | muscle | grep T | head -n 60 # # If you run it like this then you can get an idea of how many indel bearing seqs are generated: # # for i in {1..50}; do simrrls -n 1 -L 1 -I .05 -r1 $RANDOM 2>&1 | grep 0 -A 1 | tr '@' '>' | muscle | grep T | head -n 40 >> rpt.txt; done # grep "-" rpt.txt | wc -l # # From experimentation: # # -I value -- %loci w/ indels # * 0.02 -- ~10% # * 0.05 -- ~15% # * 0.10 -- ~25% # # The simulated data will live in these directories: # * SIM_NO_DIR = WORK_DIR/simulated_data/simno # * SIM_LO_DIR = WORK_DIR/simulated_data/simlo # * SIM_HI_DIR = WORK_DIR/simulated_data/simhi # * SIM_LARGE_DIR = WORK_DIR/simulated_data/simlarge # # Timing: # * 10K loci -- ~8MB -- ~ 2 minutes # * 100K loci -- ~80MB -- ~ 20 minutes # In[326]: import subprocess import shutil force = True ## Directories for the simulation data SIM_NO_DIR=os.path.join(SIMULATION_DATA_DIR, "simno") SIM_LO_DIR=os.path.join(SIMULATION_DATA_DIR, "simlo") SIM_HI_DIR=os.path.join(SIMULATION_DATA_DIR, "simhi") SIM_LARGE_DIR=os.path.join(SIMULATION_DATA_DIR, "simlarge") for dir in [SIMULATION_DATA_DIR, SIM_NO_DIR, SIM_LO_DIR,\ SIM_HI_DIR, SIM_LARGE_DIR]: if force and os.path.exists(dir): shutil.rmtree(dir) if not os.path.exists(dir): os.makedirs(dir) ## Tree input doesn't work right. Egglib errors. ## These are the trees from pyrad v3 manu, not sure if they ever worked there either # newick = "(((((A:2,B:2):2,C:4):4,D:8):4,(((E:2,F:2):2,G:4):4,H:8):4):4,(((I:2,J:2):2,K:4):4,L:8):8,X:16):16;" newick = "(((((A,B),C),D),(((E,F),G),H)),(((I,J),K),L));" treefile = os.path.join(SIMULATION_DATA_DIR, "simtree.txt") with open(treefile, 'w') as outfile: outfile.write(newick) cmd = """\ export PATH={workdir}/miniconda/bin:$PATH; time simrrls -o {odir}/{oname} -ds 2 -L 10000 -I {indels} -r1 1 """ ## simulate no indels call = cmd.format(workdir=WORK_DIR, odir=SIMULATION_DATA_DIR, oname="simno/simno", indels=0, tree=treefile) print(call) output = subprocess.check_output(call, shell=True, stderr=subprocess.STDOUT) print(output) ## simulate low indels call = cmd.format(workdir=WORK_DIR, odir=SIMULATION_DATA_DIR, oname="simlo/simlo", indels=0.02) print(call) output = subprocess.check_output(call, shell=True, stderr=subprocess.STDOUT) print(output) ## simulate high indels call = cmd.format(workdir=WORK_DIR, odir=SIMULATION_DATA_DIR, oname="simhi/simhi", indels=0.05) print(call) output = subprocess.check_output(call, shell=True, stderr=subprocess.STDOUT) print(output) ## simulate large dataset, no indels but w/ allele dropout from mutations to cutsites call = "export PATH={workdir}/miniconda/bin:$PATH; time simrrls -o {odir}/{oname} -ds 2 -L 100000 -mc 1 -r1 1"\ .format(workdir=WORK_DIR, odir=SIMULATION_DATA_DIR, oname="simlarge/simlarge") print(call) #output = subprocess.check_output(call, shell=True, stderr=subprocess.STDOUT) print(output) # ## ipyrad simulated data assembly # # For simulated datasets with 10000 l00bp loci ipyrad runs in # * v.0.3.41 ~6 minutes. # * v.0.3.42 ~8 minutes. # * v.0.4.0 ~4 minutes. # * v.0.5.10 ~4 minutes. # # Large dataset (100000 100bp loci) # * v.0.4.0 ~24 minutes. # In[2]: import ipyrad as ip reload(ip) ip.__version__ # In[3]: import ipyrad as ip ## Set up directory structures. 
change the force flag if you want to
## blow everything away and restart
#force = True
force = ""

IPYRAD_SIMOUT=os.path.join(IPYRAD_DIR, "SIMDATA/")
if force and os.path.exists(IPYRAD_SIMOUT):
    shutil.rmtree(IPYRAD_SIMOUT)
if not os.path.exists(IPYRAD_SIMOUT):
    os.makedirs(IPYRAD_SIMOUT)
for outdir in ["no/", "lo/", "hi/", "large/"]:
    tmpdir = os.path.join(IPYRAD_SIMOUT, "sim"+outdir)
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)

## go to the ipyrad simulated data output directory
#for dir in ["no", "lo", "hi", "large"]:
for dir in ["hi"]:
    ## Set barcode and raw data files
    simout_dir = os.path.join(IPYRAD_SIMOUT, "sim"+dir)
    simdata_dir = os.path.join(SIMULATION_DATA_DIR, "sim"+dir)
    bcodes = os.path.join(simdata_dir, "sim{}_barcodes.txt".format(dir))
    fastqs = os.path.join(simdata_dir, "sim{}_R1_.fastq.gz".format(dir))
    os.chdir(simout_dir)

    ## Make a new assembly
    data = ip.Assembly("sim"+dir)

    ## Set params
    data.set_params('project_dir', "./test_rad")
    data.set_params('raw_fastq_path', fastqs)
    data.set_params('barcodes_path', bcodes)
    data.set_params('max_low_qual_bases', 4)
    data.set_params('filter_min_trim_len', 69)
    data.set_params('max_Ns_consens', (99,99))
    data.set_params('max_Hs_consens', (99,99))
    data.set_params('max_SNPs_locus', (100, 100))
    data.set_params('min_samples_locus', 2)
    data.set_params('max_Indels_locus', (99,99))
    data.set_params('max_shared_Hs_locus', 99)
    data.set_params('trim_overhang', (2,2,2,2))
    data.write_params(force=True)

    cmd = "ipyrad -p params-sim{}.txt -s 1234567 --force".format(dir)
    print(cmd)
    get_ipython().system('time $cmd')

# ## pyrad simulated data assembly
# Simulated assembly approximate runtimes:
#
# * 10000 100bp loci pyrad runs in ~11 minutes.
# * 100000 100bp loci pyrad runs in ~90 minutes.

# ### Set the default pyrad params file.
OZ4S!\\-J:G0W3MC[MNAL@5Z?>KJ9BHW@:/ZAGS[*A\nM5>U,AI<4[2*QU%+E"_."9A1):>"1E^HWGI(P_JP/9>-V3.[F9 \\3XQ"W#*>HIA+7@%D\\&HY%ULTC\nMC$#)4_HOSZ#D3ES9/=B3%ILR=H*^\'B(#.!/6G(X>[N*\'T>$"+MH"\nMRC;$&3_9;@8,\';%_<>/5GB>[;(%1F5^GF%Z5,!6U>>%EW_8*3#+&9QG$P3;P\nM3I_9!DZB%K@NG,:_FUQICI7/F@-OT-)T;RP>J3B)6V#L7^B;*3BL*&M2#&+_\nM_H8R6N--$T*2^:C_\'\\!))R \\4*&T@(_DIBTU"J,H)O@W)M!V3&PA&%ZH^P"!\nMQ[W Z8;C6Y\\!MVVHY$H:%/POO\\WX)1_\'@,\\VP&VX^"SYLYJ\';90WS6)GDV/ \nMG4_K4@FV\\M!-(]N\'F+Q=A^RZZ?1@3S9%W]]^,U,/$D=1@%F&^8UJZ;XA[MLM\nM-GI-FIQO@#]CLRP1VL6LD"=-"R(TU\\H8_Z\'9(H247%X&Z[&A![OSG\\:!1%4@\nM%]4C;YHB:Y\'B.$EV!.T[VC\'LSH*HTVHSCI1\\9HD6\\P(C1FZZCR!.\\,.(1($G\nMY/DPOO=,N&V*MH_H06^>-&8&F#AM7,W1J.47<5I_)-,KJ_[@;67N"U\nM#86\\YIK7FS[!W,>VV[_+,%YS50MN_&[Q4>#.AC@]0R%PXL,16B#3VS-5PV\\6\nM$1"KXWAH%35/&ZPX>\\T0_C7@_U"%8@2B3*HUL)_1>K [\nM&W;COZ/;#7[N &L\\M9_JL=UH\\A8VDMM@_S6:8,MS7;;IY*=Y29%4:HR8RXI+\n/NSNQ;&EX=U89_ >8=H2-\n \nend\n' pyrad_params = pyrad_params.decode("uu").decode("zlib") print(pyrad_params) ## If you want to make changes to the params file you can generate a new uuencoded string ## by editing the pyrad/params-pyrad.txt file and running these lines. If you run this ## at a terminal inside python shell then it'll print out nicely as one big fat line. #with open(os.path.join(PYRAD_DIR, "params-pyrad.txt")) as infile: # infile.read().encode("zlib").encode("uu") # In[9]: import subprocess ## Set up directory structures. change the force flag if you want to ## blow everything away and restart # force = True force = "" PYRAD_SIMOUT=os.path.join(PYRAD_DIR, "SIMDATA/") if force and os.path.exists(PYRAD_SIMOUT): shutil.rmtree(PYRAD_SIMOUT) if not os.path.exists(PYRAD_SIMOUT): os.makedirs(PYRAD_SIMOUT) for outdir in ["no/", "lo/", "hi/", "large/"]: tmpdir = os.path.join(PYRAD_SIMOUT, "sim"+outdir) if not os.path.exists(tmpdir): os.makedirs(tmpdir) ## go to the pyrad simulated data output directory for dir in ["no", "lo", "hi", "large"]: ## Set barcode and raw data files simdata_dir = os.path.join(SIMULATION_DATA_DIR, "sim"+dir) bcodes = os.path.join(simdata_dir, "sim{}_barcodes.txt".format(dir)) fastqs = os.path.join(simdata_dir, "sim{}_R1_.fastq.gz".format(dir)) ## Set output directory and params file simout_dir = os.path.join(PYRAD_SIMOUT, "sim"+dir) my_params_file = os.path.join(simout_dir, "params{}-pyrad.txt".format(dir)) ## Grab the generic pyrad params file and modify it for this run my_params = pyrad_params.replace("", simout_dir) my_params = my_params.replace("", fastqs) my_params = my_params.replace("", bcodes) ## Write the params out to a file with open(my_params_file, 'w') as outfile: outfile.write(my_params) cmd = "export PATH={workdir}/miniconda/bin:$PATH; ".format(workdir=WORK_DIR) cmd += "time pyrad -p {params_file} -s 1234567".format(params_file=my_params_file) ## Good sub for testing #cmd += "time pyrad --version" print(cmd) output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) print(output) # ## stacks simulated data assembly # Output for stacks will be a bit trickier bcz we'll run gapped, ungapped and default on all sim data # ``` # WORK_DIR # |____stacks # |___SIMOUT # |___gapped # |____simno # |____simlo # |____simhi # |____ungapped # | # |____default # ``` # # We'll make calls to stacks that will look kind of like this: # ``` # * process_radtags args # * -f input file # * -i gzfastq # * -b barcodes file # * -e pstI # * -o output dir # # process_radtags -f simno_R1_.fastq.gz -i gzfastq -b simno_barcodes.txt -e pstI -o simout # # * denovo_map.pl args # * -m 2: Minimum depth to 
make a stack # * -M 10: Number of mismatches per locus within individuals # * -N 10: Number of allowed mismatches in secondary reads # * -n 10: Number of mismatches when clustering across individuals # # Housekeeping flags: # * -T 40: Number of threads # * -b 1: Batch id (Can be anything) # * -S: Disable sql # # denovo_map.pl -m 2 -M 10 -N 10 -n 10 -T 40 -b 1 -S --gapped -X 'populations:--vcf --genepop --structure --phase --fastphase --phylip ' -X 'populations:-m 6' -X 'ustacks:--max_locus_stacks 2' -o /home/iovercast/manuscript-analysis/stacks/REALDATA/gapped/ -s # # Times: # # * Demux small - 15 seconds # * Demux large - 2m30s # * denovo_map small - # * denovo_map large - 23m # In[335]: ## These are the python variables for the main stacks simout paths: ## STACKS_GAP_SIMOUT, STACKS_UNGAP_SIMOUT, STACKS_DEFAULT_SIMOUT STACKS_SIMOUT=os.path.join(STACKS_DIR, "SIMDATA/") STACKS_GAP_SIMOUT=os.path.join(STACKS_SIMOUT, "gapped/") STACKS_UNGAP_SIMOUT=os.path.join(STACKS_SIMOUT, "ungapped/") STACKS_DEFAULT_SIMOUT=os.path.join(STACKS_SIMOUT, "default/") force = True #force = "" if force and os.path.exists(STACKS_SIMOUT): shutil.rmtree(STACKS_SIMOUT) if not os.path.exists(STACKS_SIMOUT): os.makedirs(STACKS_SIMOUT) ## Make the sumulation output directories if they don't already exist for dir in [STACKS_GAP_SIMOUT, STACKS_UNGAP_SIMOUT, STACKS_DEFAULT_SIMOUT]: if not os.path.exists(dir): os.makedirs(dir) for outdir in ["no/", "lo/", "hi/", "large/"]: tmpdir = os.path.join(dir, "sim"+outdir) if not os.path.exists(tmpdir): os.makedirs(tmpdir) ## Do the stacks sim analysis for dir in [STACKS_GAP_SIMOUT, STACKS_UNGAP_SIMOUT, STACKS_DEFAULT_SIMOUT]: for sim in ["no", "lo", "hi", "large"]: print("##################################") print("Doing - " + dir + " - sim" + sim) print("##################################") ## Where simulation results will be written simout_dir = os.path.join(dir, "sim"+sim) ## Where the rawdata and barcodes live simdata_dir = os.path.join(SIMULATION_DATA_DIR, "sim"+sim) ## We can just use the sim barcodes file as the stacks population map ## because all it really is doing is reading the first column, which ## is the names of samples to use in denovo_map.pl ipyrad_bcodes = os.path.join(simdata_dir, "sim{}_barcodes.txt".format(sim)) fastqs = os.path.join(simdata_dir, "sim{}_R1_.fastq.gz".format(sim)) ## Munge the ipyrad barcodes file into the format stacks wants for the population map ## file (ie sample name then population). Here we just assign all samples to be from ## population 1 popmap = os.path.join(simout_dir, "stacks_popmap.txt") with open(ipyrad_bcodes) as infile: with open(popmap, 'w') as outfile: lines = infile.readlines() for line in lines: ## This is just setting all samples to population 1 outfile.write(line.split()[0]+"\t1\n") ## Munge the sim barcodes file into the format that stacks wants ## basically the inverse of what ipyrad uses barcode first then sample name ## There is probably a fancier/smarter way to do this, but this works. bcodes = os.path.join(simout_dir, "stacks_barcodes.txt") with open(ipyrad_bcodes, 'r') as infile: with open(bcodes, 'w') as outfile: lines = infile.readlines() for line in lines: if line: outfile.write("\t".join(line.split()[::-1])+"\n") ## Export the path so we can actually find the binaries cmd = "export PATH={workdir}/miniconda/bin:$PATH; ".format(workdir=WORK_DIR) ## Add the command to process_radtags. 
## Yes we don't have to do this over and over, but it's fast, and also ipyrad/pyrad ## are paying the same demultiplex penalty for every run. cmd += "time process_radtags -f {simdata} -i gzfastq -b {barcodes} -e pstI -o {simout}; "\ .format(simdata=fastqs, barcodes=bcodes, simout=simout_dir) ## Toggle the dryrun flag for testing #DRYRUN="-d" DRYRUN="" ## Don't add all the fancy stuff if you just want to do the default assembly ADDITIONAL_PARAMS = "" GAPPED = "" if not dir == STACKS_DEFAULT_SIMOUT: ADDITIONAL_PARAMS = " -m 2 -M 10 -N 10 -n 10 " if dir == STACKS_GAP_SIMOUT: ADDITIONAL_PARAMS += " --gapped " OUTPUT_FORMATS = "--vcf --genepop --structure --phase --fastphase --phylip " cmd += "time denovo_map.pl -T 40 -b 1 -S " + DRYRUN + ADDITIONAL_PARAMS\ + " -X \'populations:" + OUTPUT_FORMATS + "\'"\ + " -o " + simout_dir + " --samples " + simout_dir + " -O " + popmap print(cmd) output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) print(output) # ## aftrRAD simulated data assembly # aftrRAD throws out any read with one or more bases less than # the phred qscore `minQual` setting. You can't really tune how many # low quality bases to retain. For the simulated data we are simulating # all qscores arbitrarily high, so this isn't a problem. For # empirical we might have to think about lowering minQual... # # Actually, throwing out many reads seems to be the aftrRAD strategy. I think # its trying to reduce the number of potential reads to match (reduce singletons # from error) because it # # Performance on 10000 100bp simulated dataset: # * ~200 minutes # # Performance on 100000 100bp simulated dataset: # * AftrRad.pl - ~4500 minutes # * Genotype.pl - 13 days (my notebook crashed, but it finished in about this long) # In[ ]: import subprocess import gzip ## Set up directory structures. change the force flag if you want to ## blow everything away and restart # force = True force = "" AFTRRAD_SIMOUT=os.path.join(AFTRRAD_DIR, "SIMDATA/") if force and os.path.exists(AFTRRAD_SIMOUT): shutil.rmtree(AFTRRAD_SIMOUT) if not os.path.exists(AFTRRAD_SIMOUT): os.makedirs(AFTRRAD_SIMOUT) for outdir in ["no/", "lo/", "hi/", "large/"]: tmpdir = os.path.join(AFTRRAD_SIMOUT, "sim"+outdir) if not os.path.exists(tmpdir): os.makedirs(tmpdir) ## go to the pyrad simulated data output directory for dir in ["no", "lo", "hi", "large"]: print("Doing - {}".format(dir)) ## Set barcode and raw data files simdata_dir = os.path.join(SIMULATION_DATA_DIR, "sim"+dir) ipyrad_bcodes = os.path.join(simdata_dir, "sim{}_barcodes.txt".format(dir)) fastqs = os.path.join(simdata_dir, "sim{}_R1_.fastq.gz".format(dir)) simout_dir = os.path.join(AFTRRAD_SIMOUT, "sim"+dir) if not os.path.exists(simout_dir): os.mkdir(simout_dir) bcodes = os.path.join(simout_dir, "aftrrad-{}-barcodes.txt".format(dir)) ## Munge the ipyrad barcodes file to the format aftrrad wants ## (barcode\tsample name) with open(ipyrad_bcodes, 'r') as infile: with open(bcodes, 'w') as outfile: lines = infile.readlines() for line in lines: if line: outfile.write("\t".join(line.split()[::-1])+"\n") ## Make data and barcodes directories data_dir = os.path.join(simout_dir, "Data") bcodes_dir = os.path.join(simout_dir, "Barcodes") for tmp in [data_dir, bcodes_dir]: if not os.path.exists(tmp): os.mkdir(tmp) ## Copy the fastq to the aftrrad/Data dir and the barcodes to the ## aftrrad/Barcodes dir. These files have to be in these directories ## and they have to have the exact same name, derp. 
    aftrrad_fastqs = os.path.join(data_dir, "sim-{}.txt".format(dir))
    with gzip.open(fastqs, 'rb') as f_in, open(aftrrad_fastqs, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    shutil.copy2(bcodes, os.path.join(bcodes_dir, "sim-{}.txt".format(dir)))

    ## AfterRAD is incredibly picky about where scripts and binaries are so you
    ## have to do a bunch of annoying housekeeping.
    cmd = "cp -rf {}/* {}".format(os.path.join(AFTRRAD_DIR, "AftrRAD/AftrRADv5.0"), simout_dir)
    os.system(cmd)
    shutil.copy2(os.path.join(AFTRRAD_DIR, "dnaMatrix"), simout_dir)
    shutil.copy2(os.path.join(AFTRRAD_DIR, "ACANA"), simout_dir)

    ## This first line of nonsense is to make it so perl can find the Parallel::ForkManager library
    base = "export PATH={}perl5/bin:$PATH; cpanm --local-lib={}/perl5 local::lib && eval $(perl -I {}/perl5/lib/perl5/ -Mlocal::lib); ".format(WORK_DIR, WORK_DIR, WORK_DIR)

    ## Good sub for testing
    cmd = base + "time perl AftrRAD.pl -h"

    ## numIndels-3 What do here?
    ## stringLength-99 We don't care about homopolymers here, so set it very high
    ## maxH-90 ??
    ## AftrRAD manu analysis settings - P2: noP2; minDepth: 5; numIndel: 3; MinReads: 5
    cmd = base + "time perl AftrRAD.pl re-TGCAG minDepth-6 P2-noP2 minIden-85 " \
               + "stringLength-99 " \
               + "DataPath-Data/ BarcodePath-Barcodes/ maxProcesses-40"
    os.chdir(simout_dir)
    print(cmd)

    output = ""
    try:
        output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
        pass
    except Exception as inst:
        print(inst.output)
    print(output)

    ## MinReads is mindepth_statistical
    cmd = base + "time perl Genotype.pl MinReads-6 subset-0 maxProcesses-40"
    #cmd = base + "perl Genotype.pl -h"
    print(cmd)
    output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
    print(output)

    ## pctScored is min_samples_locus as a percent of total samples, here since
    ## we use 2 for min_samples_locus and there are 12 simulated samples we use 17%
    cmd = base + "time perl FilterSNPs.pl pctScored-17"
    print(cmd)
    output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
    print(output)

# ### Post-processing aftrrad
# aftrrad doesn't provide vcf as output so we have to make it ourselves
#
# Ooof. The OutputFasta.pl file has a SNPsOnly flag, but it doesn't output an alignment, so you can't use it. Also snp-sites doesn't convert ambiguity codes, so aftrRAD post processing is totally not writing out a good vcf.

# In[ ]:

get_ipython().run_cell_magic('bash', '-s "$WORK_DIR" -s "$AFTRRAD_DIR"', 'export PATH="$1/miniconda/bin:$PATH"; export "WORK_DIR=$1"; export "AFTRRAD_DIR=$2"\n\ncd $AFTRRAD_DIR\n./brew/bin/brew tap homebrew/science\n./brew/bin/brew install snp-sites\n\n## Copy the snp-sites binary to the miniconda bin dir so it will be available\ncp brew/Cellar/snp-sites/2.2.0/bin/snp-sites $WORK_DIR/miniconda/bin/\n\ncd SIMDATA\nfor d in sim*;\ndo\n echo $d\n cd $d/Formatting\n perl OutputFasta.pl SNPsOnly-1 ambig-1\n snp-sites -v -o $d.vcf SNPMatrix_*.All.fasta\n cd ../..\ndone\n')

# ### More post processing for aftrRAD
# snp-sites codes missing data (the '*' character) as just another ref or alt
# allele, so we have to convert any genotype that points at the '*' allele to
# './.' to align with the other vcf formats for the results analysis.
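#
# As a concrete (made-up) illustration of what the next cell does, assuming the
# haploid-style genotype columns that snp-sites writes: a record where one allele
# is the missing-data character '*'
#
#     1   42   .   A   T,*   .   .   .   GT   0   1   2   1
#
# has the genotype that points at '*' (here the third sample's "2") rewritten to './.':
#
#     1   42   .   A   T,*   .   .   .   GT   0   1   ./.   1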
# In[ ]:

import os
import shutil

AFTRRAD_SIMOUT=os.path.join(AFTRRAD_DIR, "SIMDATA/")
os.chdir(AFTRRAD_SIMOUT)

for outdir in ["simno", "simlo", "simhi", "simlarge"]:
    myvcf = "{}/Formatting/{}.vcf".format(outdir, outdir)
    print("Doing - {}".format(myvcf))
    shutil.copy2(myvcf, myvcf+".bak")

    f = open(myvcf)
    lines = f.readlines()
    f.close()

    newlines = []
    for line in lines:
        if "#" in line:
            newlines.append(line.strip())
            continue
        dat = line.split()
        ref = dat[3]
        alt2 = ""
        try:
            alt2 = dat[4].split(",")[1]
        except:
            pass
        alt1 = dat[4].split(",")[0]
        if "*" in [ref, alt1, alt2]:
            idx = [ref, alt1, alt2].index("*")
            for i, val in enumerate(dat[9:]):
                if int(val) == idx:
                    dat[9+i] = "./."
            #print(dat)
        newlines.append("\t".join(dat))

    outfile = open(myvcf, 'w')
    outfile.write("\n".join(newlines))
    outfile.close()

# ## dDocent simulated data assembly
# Simulated assembly approximate runtimes:
#
# * 10000 100bp loci dDocent runs in ~2 minutes.
# * 100000 100bp loci dDocent runs in ~15 minutes.
#
# ~~This would be what you would do if dDocent actually could run unsupervised, but it REQUIRES
# you to sit there and peck at a couple keys, so this part of the script just sets up the
# input files and stuff for you, you're responsible for running this by hand in each sim directory.~~
#
# Actually nevermind, I found the two sections of the dDocent script that prompt for user input. These map more or less to mindepth_statistical and min_samples_locus. Hard coded two params on dDocent shell script lines 546 and 585 to set values of CUTOFF=6 and CUTOFF2=2, to align w/ these ipyrad settings.
#
# Actually nevermind, it's still doing some kind of pipe to stdout that's broken running inside the notebook, so still run these by hand. I'm leaving the hardcoded values tho.
#
# Also, on line 379 there's a hardcoded cutoff for final output where it only retains snps called in 90% of individuals.
#
#     echo "Using VCFtools to parse SNPS.vcf for SNPs that are called in at least 90% of individuals"
#     vcftools --vcf TotalRawSNPs.vcf --geno 0.9 --out Final --counts --recode --non-ref-af 0.001 --max-non-ref-af 0.9999 --mac 1 --minQ 30 --recode-INFO-all &>VCFtools.log
#
# Also, the final vcf files that are output contain complex genotypes rather than just snps (this is the default for freebayes), so you have to run some extra code-jams to get just
# the snps (vcfallelicprimitives and then vcftools --remove-indels).
change the force flag if you want to ## blow everything away and restart # force = True force = "" DDOCENT_SIMOUT=os.path.join(DDOCENT_DIR, "SIMDATA/") if force and os.path.exists(DDOCENT_SIMOUT): shutil.rmtree(DDOCENT_SIMOUT) if not os.path.exists(DDOCENT_SIMOUT): os.makedirs(DDOCENT_SIMOUT) for outdir in ["no/", "lo/", "hi/", "large/"]: tmpdir = os.path.join(DDOCENT_SIMOUT, "sim"+outdir) if not os.path.exists(tmpdir): os.makedirs(tmpdir) ## go to the pyrad simulated data output directory for dir in ["no", "lo", "hi", "large"]: print("Doing - {}".format(dir)) ## Set barcode and raw data files simdata_dir = os.path.join(SIMULATION_DATA_DIR, "sim"+dir) bcodes = os.path.join(simdata_dir, "sim{}_barcodes.txt".format(dir)) fastqs = os.path.join(simdata_dir, "sim{}_R1_.fastq.gz".format(dir)) simout_dir = os.path.join(DDOCENT_SIMOUT, "sim"+dir) if not os.path.exists(simout_dir): os.mkdir(simout_dir) os.chdir(simout_dir) ## dDocent doesn't do demultiplexing, so we'll use ipyrad for it real quick data = ip.Assembly("sim"+dir) data.set_params('raw_fastq_path', fastqs) data.set_params('barcodes_path', bcodes) data.write_params(force=True) cmd = "ipyrad -p params-sim{}.txt -s 1".format(dir) print(cmd) get_ipython().system('time $cmd') ## Now we have to rename all the files in the way dDocent expects them: ## 1A_0_R1_.fastq.gz -> Pop1_Sample1.F.fq.gz ## ## Have to cd to the directory cuz we have to be in the data dir when we run dDocent anyway os.chdir("sim{}_fastqs".format(dir)) fq_files = glob.glob("./*.fastq.gz") for f in fq_files: newname = "Pop{}_Sample{}.F.fq.gz".format(f[2], f[3]) print("Renaming {} -> {}".format(f, newname)) os.rename(f, newname) ## Write out the config file for this run. ## Compacted the config file into one long line here to make it not take up so much room config_file = "sim{}-config.txt".format(dir) with open(config_file, 'w') as outfile: outfile.write('Number of Processors\n40\nMaximum Memory\n0\nTrimming\nyes\nAssembly?\nyes\nType_of_Assembly\nSE\nClustering_Similarity%\n0.85\nMapping_Reads?\nyes\nMapping_Match_Value\n1\nMapping_MisMatch_Value\n3\nMapping_GapOpen_Penalty\n5\nCalling_SNPs?\nyes\nEmail\nwatdo@mailinator.com\n') cmd = "export LD_LIBRARY_PATH={}/freebayes-src/vcflib/tabixpp/htslib/:$LD_LIBRARY_PATH; ".format(DDOCENT_DIR) cmd += "export PATH={}:$PATH; time dDocent {}".format(DDOCENT_DIR, config_file) print(cmd) #!$cmd ## You have to post-process the vcf files to decompose complex genotypes ## and remove indels fullvcf = simdata_dir + "TotalRawSNPs.vcf" filtvcf = simdata_dir + "Final.recode.vcf" for f in [fullvcf, filtvcf]: print("Finalizing - {}".format(f)) ## Naming the new outfiles as .snps.vcf outfile = simdata_dir + f.split("/")[-1].split(".vcf")[0] + ".snps.vcf" cmd = "export PATH={}:$PATH; vcfallelicprimitives {} > ddoc-tmp.vcf".format(DDOCENT_DIR, f) print(cmd) #!$cmd cmd = "export PATH={}:$PATH; vcftools --vcf ddoc-tmp.vcf --remove-indels --recode --recode-INFO-all --out {}".format(DDOCENT_DIR, outfile) #!$cmd get_ipython().system('rm ddoc-tmp.vcf') # # Empirical Analysis # An empirical analysis was conducted on the published data set from Eaton & Ree (2013), # available de-multiplexed in fastQ format at the ncbi sequence read archive (SRA072507). # The data are already demultiplexed ipyrad step 1 and stacks `process_radtags` # are skipped. Because Stacks and ipyrad/PyRAD use different quality filtering methods we # employed filtering by ipyrad only. This way quality filtering does not affect the results # of comparison between the programs. 
After trimming the barcode and adapter # read lengths are 69bp. At .85 similarity this is equivalent to 10 base differences. # # ## ipyrad # Some representative runs to give you an expectation of how long things should take: # # Step 1 - Linking samples: # ``` # real 0m33.881s # user 0m24.455s # sys 0m1.877s # ``` # Step 2 - Filtering Raw Reads: # ``` # real 7m48.600s # user 0m15.081s # sys 0m5.252s # ``` # Step 3 - Clustering within samples (`-c 13`): # ``` # real 29m56.718s # user 0m24.510s # sys 0m1.098s # ``` # Step 3 - Clustering within samples (`-c 40`): # ``` # ## There is some overhead associated with parallelization so speedup # ## isn't quite 2x as fast, but is still considerable. With larger # ## number of samples the speed gain of massive parallelization # ## is pronounced. # # real 18m59.515s # user 0m23.856s # sys 0m1.058s # ``` # # All steps 1-7 with `-c 40`: # ``` # real 50m25.647s # user 9m34.733s # sys 0m20.489s # ``` # # All steps 1-7 with `-c 13`: # ``` # real 69m26.015s # user 9m1.946s # sys 0m20.285s # ``` # # # ### ipyrad params file (params-example.txt) # Next, we write out the ipyrad params file to the ipyrad working direcotory. # In[198]: ## I zipped and uuencoded the params file so it wouldn't take up so much visual room ## Lol params = 'begin 666 \nM>)R55EUOVS@0?->O6" /9_=LV9:3-"W: XRVUQ1HTR:7>RH*@I9HFX4D*B25\nMQ/WU-R0E1W$3VR<$"+FD9[@?L^1PZ#^2U5KSC"JN>6%H(7-!O=MX\'$_CR4E_\nM>/@777V8?7X_NY[1,]_1$7T?_Z#OW!A1S/,U*WDA?KRF63,G-X_I7R,RLLK/\nM2-6VJBUE4HO4*BT%3J@TM1!DK*A,%(_V<6\\.,,$!*JU^ HX!%?3?PLQQ4*_@\nMF2!94EIK-Y<+*I6E2@LC2MN/=L(\'@@0$FM^Q!3?VAE7_#F [!IM1VHKL>;<>N3,*NQ]YE8E2W:J=/"?=ZBB$7:FL6Q_!\nM0KV -" M%D*+,A4#"J8_?[,,-Y:#DG?JDM?^@AEQ4[O!=@+;#=1N"%%V*MJ#\nM_Q+X&;?8"1GE\'9-SDZHMMQ0[_)-,NX[<7&$D&?1=+J79^+Z1;72\nM*+Q+DRHMF%HLC+!@\\E:Z)&^F8*:>*E$G/+="E\\C7K7A[>MR/3O<[-\'%]H9!E\nM)BJ[8@9ID<8ZF3B?<&1O]ZVGL_;@G2R7![$D79:"_]1U+GYC@!V=SJZ\';O4Q\nMQV2,;P_\'-.3&XS492?,:C5(W\'\'?2KD!H?$,PT3@^.]EW;M< / BS"+M9J=P)\nM\\UV Q<%H8_8>H(=ZK;=:CG:<><-QTM14T[I8(4W!;=KZ4-;%\'"Y ?XB%NN-S\nMQ*;=@CX\'A]J>%TT.8\'-*AVIQ?,8S7N&_*]^_O25<,8UU5&E9X#_U)DXIR=N@\nM)*%15Z_V\\[Q\\X$\'B&7Y;L%R43=(Q6B(COJGPS!!?./J&FMSF*#G F;,F=(B,\nM0$I9JDK<5ZT>&RM5@#32AOLM[*C-IH>9:#J@78KT3*WP+[9)+OY A.K2%2ID\nMZ=M!_S%1[PH=Y@K]Y0">9-SPG&_SG -H)1 =]6N]5/9YE@/BEC2B9XT4T,[2\nMVC2I.6H%XN/F5WQ=A!>)4^* )CO$Z F2QHU_+KYUT.\'&$3E3!_I_1:>1./N$\nM3I)O Z.8I+<_B;X/^KB!-BONNN[Y-OI#[%WMH)J>I$%L=DG>,SF]BTQ:AEO\'\nM X\'C ^;N%AH&Y!:OAG;B]TCI7X#7FG;MO ^A(_AK0FT#E1W#@\nMQ8[3MRQ.VJ$:& H#C)2&M\\=_GRU$J0 \n \nend\n' print(params.decode("uu").decode("zlib")) with open(os.path.join(IPYRAD_DIR, "params-example.txt"), 'w') as outfile: outfile.write(params.decode("uu").decode("zlib")) ## If you want to make changes to the params file you can generate a new uuencoded string ## by editing the pyrad/params-pyrad.txt file and running these lines. If you run this ## at a terminal inside python shell then it'll print out nicely as one big fat line. #with open(os.path.join(PYRAD_DIR, "params-pyrad.txt")) as infile: # infile.read().encode("zlib").encode("uu") # ### ipyrad - Import samples and filter raw data # This step should take 5-10 minutes depending on the available resources. 
# In[76]: get_ipython().run_cell_magic('bash', '-s "$WORK_DIR" "$IPYRAD_DIR"', 'export PATH="$1/miniconda/bin:$PATH"; export "IPYRAD_DIR=$2"\ncd $IPYRAD_DIR\n\n## Import demultiplexed samples into a new assembly and filter the raw data\ntime ipyrad -p params-example.txt -s 12\n') # In[199]: get_ipython().run_cell_magic('bash', '-s "$WORK_DIR" "$IPYRAD_DIR"', 'export PATH="$1/miniconda/bin:$PATH"; export "IPYRAD_DIR=$2"\ncd $IPYRAD_DIR\n\n## Full run with 40 cores\ntime ipyrad -p params-example.txt -s 34567 -f -c 40\n') # ## pyRAD v.3 # In all cases with pyrad analysis we set `N processors` = 13. # # Step 2: # ``` # real 1m37.142s # user 12m39.945s # sys 0m4.051s # ``` # # Running steps 3-7: # ``` # real 250m58.693s # user 1180m7.893s # sys 12m1.399s # ``` # # ### pyRAD params file (params-pyrad.txt) # # In[99]: ## I zipped and uuencoded the params file so it wouldn't take up so much visual room ## Lol params = 'begin 666 \nM>)R-5^]OVCH4_$SNS30?3\nM^^/?=9P *Q >\'TH@^/CF^)QS;[-L-(*::EIQRS4(62^L@9G24*_NKJ[AF6LC\nME(0TC,))"C :90=>0&0Y8-PM.[CU=?KJ]^7,&!UZM7$(?PC])/\nM0LZ!"8T 2J\\._7S]&M*R# 8[6$D(7U0>@IJ!5/*$\\6I16E&7?(E5S43)#0R%\nMNV>A%))#?!$@EHGW0*4;J$>J<\\5X [!G_799#NK9<%Q0[\'_:LQ!R5554,A@Z\nMCJDM K *4C@P[]^]\nM+HR3^PLRSH@9!X.S/N1+K/C[@I8(2Y?P"HRPJ*K?PA;P"[^&=Y!$S0$8SKU(\nMDB@8.B;"B_%AV#A"&S153;OBG"-LH;DI5,F &BR;\\5Q4N,FZ8*\\+35D/,AKL\nMFEIJ5S6? OZ4S!\\-J:G0W3MC[MNAL@5Z?>KJ9BHW@:/ZAGS[*A\nM5>U,AI<4[2*QU%+E"_."9A1):>"1E^HWGI(P_JP/9>-V3.[F9 \\3XQ"W#*>HIA+7@%D\\&HY%ULTC\nMC$#)4_HOSZ#D3ES9/=B3%ILR=H*^\'B(#.!/6G(X>[N*\'T>$"+MH"\nMRC;$&3_9;@8,\';%_<>/5GB>[;(%1F5^GF%Z5,!6U>>%EW_8*3#+&9QG$P3;P\nM3I_9!DZB%K@NG,:_FUQICI7/F@-OT-)T;RP>J3B)6V#L7^B;*3BL*&M2#&+_\nM_H8R6N--$T*2^:C_\'\\!))R \\4*&T@(_DIBTU"J,H)O@W)M!V3&PA&%ZH^P"!\nMQ[W Z8;C6Y\\!MVVHY$H:%/POO\\WX)1_\'@,\\VP&VX^"SYLYJ\';90WS6)GDV/ \nMG4_K4@FV\\M!-(]N\'F+Q=A^RZZ?1@3S9%W]]^,U,/$D=1@%F&^8UJZ;XA[MLM\nM-GI-FIQO@#]CLRP1VL6LD"=-"R(TU\\H8_Z\'9(H247%X&Z[&A![OSG\\:!1%4@\nM%]4C;YHB:Y\'B.$EV!.T[VC\'LSH*HTVHSCI1\\9HD6\\P(C1FZZCR!.\\,.(1($G\nMY/DPOO=,N&V*MH_H06^>-&8&F#AM7,W1J.47<5I_)-,KJ_[@;67N"U\nM#86\\YIK7FS[!W,>VV[_+,%YS50MN_&[Q4>#.AC@]0R%PXL,16B#3VS-5PV\\6\nM$1"KXWAH%35/&ZPX>\\T0_C7@_U"%8@2B3*HUL)_1>K [\nM&W;COZ/;#7[N &L\\M9_JL=UH\\A8VDMM@_S6:8,MS7;;IY*=Y29%4:HR8RXI+\n/NSNQ;&EX=U89_ >8=H2-\n \nend\n' print(params.decode("uu").decode("zlib")) with open(os.path.join(PYRAD_DIR, "params-pyrad.txt"), 'w') as outfile: outfile.write(params.decode("uu").decode("zlib")) ## If you want to make changes to the params file you can generate a new uuencoded string ## by editing the pyrad/params-pyrad.txt file and running these lines. If you run this ## at a terminal inside python shell then it'll print out nicely as one big fat line. #with open(os.path.join(PYRAD_DIR, "params-pyrad.txt")) as infile: # infile.read().encode("zlib").encode("uu") # In[98]: get_ipython().run_cell_magic('bash', '-s "$WORK_DIR" "$PYRAD_DIR"', 'export PATH="$1/miniconda/bin:$PATH"; export "PYRAD_DIR=$2"\ncd $PYRAD_DIR\n\n## Initial filtering\n##\n## pyrad v.3 performance is not improved by increasing the number\n## of cores available beyond the number of samples in the dataset\n## so results here are representative of increased values for\n## param 7 - N Processors (Parallel)\ntime pyrad -p params-pyrad.txt -s 2\n') # In[197]: get_ipython().run_cell_magic('bash', '-s "$WORK_DIR" "$PYRAD_DIR"', 'export PATH="$1/miniconda/bin:$PATH"; export "PYRAD_DIR=$2"\ncd $PYRAD_DIR\n\n## All steps 3-7. 
We do not include step 2 because we are using\n## the ipyrad filtered data\ntime pyrad -p params-pyrad.txt -s 34567\n') # ## Stacks empirical analysis # # Runtime expectations: # # Ungapped with all flags set as below and 13 cores runtime exceeded 12 hours. # # Gapped alignment with parameters set as below and 40 cores: # ``` # real 466m45.829s # user 17063m15.498s # sys 4m24.564s # ``` # # Ungapped all default assembly parameters: # ``` # real 14m49.323s # user 20m29.222s # sys 0m49.756s # ``` # # Explanantions of all the stacks parameters. These settings are (close to being) analagous to the .85 clust similarity used in PyRAD given that read length are 100 bp. # * -m 2: Minimum depth to make a stack # * -M 10: Number of mismatches per locus within individuals # * -N 10: Number of allowed mismatches in secondary reads # * -n 10: Number of mismatches when clustering across individuals # # Housekeeping flags: # * -T 40: Number of threads # * -b 1: Batch id (Can be anything) # * -S: Disable sql # # Populations flags: # * -m 6: Minimum stack depth per locus when clustering across samples # ustacks flags: # * --max_locus_stacks 2: # # In[182]: ## We build the stacks command w/ python because it'd be a pain ## to do what the glob() call is doing in bash. Then we call ## the command inside a bash cell because denovo_map expects ## all the submodules to be in the PATH ## Read in the filtered fastq from ipyrad step 2 IPYRAD_EDITS_DIR = os.path.join(IPYRAD_OUTPUT, "REALDATA_edits/") infiles = ["-s "+ff+" " for ff in glob.glob(IPYRAD_EDITS_DIR+"*_R1_*")] ## Toggle the dryrun flag for testing DRYRUN="" DRYRUN="-d" OUTPUT_FORMATS = "--vcf --genepop --structure --phase --fastphase --phylip " cmd = "denovo_map.pl -m 2 -M 10 -N 10 -n 10 -T 40 -b 1 -S " + DRYRUN\ + " -X \'populations:" + OUTPUT_FORMATS + "\'"\ + " -X \'populations:-m 6\' -X \'ustacks:--max_locus_stacks 2\' "\ + " -o " + STACKS_UNGAP_OUT + " " + "".join(infiles) print(cmd) # In[179]: get_ipython().run_cell_magic('bash', '-s "$WORK_DIR" "$STACKS_DIR" "$cmd"', 'export PATH="$1/miniconda/bin:$PATH"; export "STACKS_DIR=$2"; export "cmd=$3"\n\n## We have to play a little cat and mouse game here because of quoting in some of the args\n## and how weird bash is we have to write the cmd to a file and then exec it.\n## If you try to just run $cmd it truncates the command at the first single tic. 
Hassle.\ncd $STACKS_DIR\necho $cmd > stacks.sh; chmod 777 stacks.sh\ntime ./stacks.sh\n') # In[200]: ## Do Gapped alignment ## Read in the filtered fastq from ipyrad step 2 IPYRAD_EDITS_DIR = os.path.join(IPYRAD_OUTPUT, "REALDATA_edits/") infiles = ["-s "+ff+" " for ff in glob.glob(IPYRAD_EDITS_DIR+"*_R1_*")] ## Toggle the dryrun flag for testing DRYRUN="" #DRYRUN=" -d " OUTPUT_FORMATS = "--vcf --genepop --structure --phase --fastphase --phylip " cmd = "denovo_map.pl -m 2 -M 10 -N 10 -n 10 -T 40 -b 1 -S --gapped " + DRYRUN\ + " -X \'populations:" + OUTPUT_FORMATS + "\'"\ + " -X \'populations:-m 6\' -X \'ustacks:--max_locus_stacks 2\' "\ + " -o " + STACKS_GAP_OUT + " " + "".join(infiles) print(cmd) # In[201]: get_ipython().run_cell_magic('bash', '-s "$WORK_DIR" "$STACKS_DIR" "$cmd"', 'export PATH="$1/miniconda/bin:$PATH"; export "STACKS_DIR=$2"; export "cmd=$3"\n\n## Run the gapped alignment\ncd $STACKS_DIR\necho $cmd > stacks.sh; chmod 777 stacks.sh\ntime ./stacks.sh\n') # In[189]: ## Do all default settings IPYRAD_EDITS_DIR = os.path.join(IPYRAD_OUTPUT, "REALDATA_edits/") infiles = ["-s "+ff+" " for ff in glob.glob(IPYRAD_EDITS_DIR+"*_R1_*")] ## Toggle the dryrun flag for testing DRYRUN="" #DRYRUN="-d" OUTPUT_FORMATS = "--vcf --genepop --structure --phase --fastphase --phylip " cmd = "denovo_map.pl -T 40 -b 1 -S " + DRYRUN\ + " -X \'populations:" + OUTPUT_FORMATS + "\'"\ + " -o " + STACKS_DEFAULT_OUT + " " + "".join(infiles) print(cmd) # In[190]: get_ipython().run_cell_magic('bash', '-s "$WORK_DIR" "$STACKS_DIR" "$cmd"', 'export PATH="$1/miniconda/bin:$PATH"; export "STACKS_DIR=$2"; export "cmd=$3"\n\n## Run with all defaults\ncd $STACKS_DIR\necho $cmd > stacks.sh; chmod 777 stacks.sh\ntime ./stacks.sh\n') # ## aftrRAD empirical analysis # * AftrRAD.pl - 20164m43s # * Genotype.pl - 879m18s # * FilterSNPs.pl - 574m31s # In[14]: import subprocess import gzip ## Set up directory structures. change the force flag if you want to ## blow everything away and restart # force = True force = "" if force and os.path.exists(AFTRRAD_OUTPUT): shutil.rmtree(AFTRRAD_OUTPUT) if not os.path.exists(AFTRRAD_OUTPUT): os.makedirs(AFTRRAD_OUTPUT) os.chdir(AFTRRAD_OUTPUT) ## Prep empirical data by copying it to a local folder called DemultiplexedFiles ## in this way you don't need the Barcodes/Data dir demux_dir = os.path.join(AFTRRAD_OUTPUT, "DemultiplexedFiles/") if force and os.path.exists(demux_dir): shutil.rmtree(demux_dir) if not os.path.exists(demux_dir): os.makedirs(demux_dir) IPYRAD_EDITS_DIR = os.path.join(IPYRAD_OUTPUT, "REALDATA_edits/") infiles = [ff for ff in glob.glob(IPYRAD_EDITS_DIR+"*_R1_*")] ## Don't recopy the data over and over again while testing if not os.path.exists(demux_dir+infiles[0].split("/")[-1]): for f in infiles: shutil.copy2(f, demux_dir+f.split("/")[-1]) ## AfterRAD is incredibly picky about where scripts and binaries are so you ## have to do a bunch of annoying housekeeping. 
cmd = "cp -rf {}/* {}".format(os.path.join(AFTRRAD_DIR, "AftrRAD/AftrRADv5.0"), AFTRRAD_OUTPUT) os.system(cmd) shutil.copy2(os.path.join(AFTRRAD_DIR, "dnaMatrix"), AFTRRAD_OUTPUT) shutil.copy2(os.path.join(AFTRRAD_DIR, "ACANA"), AFTRRAD_OUTPUT) ## This first line of nonsense is to make it so perl can find the Parallel::ForkManager library base = "export PATH={}perl5/bin:$PATH; cpanm --local-lib={}/perl5 local::lib && eval $(perl -I {}/perl5/lib/perl5/ -Mlocal::lib); ".format(WORK_DIR, WORK_DIR, WORK_DIR) ## Good sub for testing #cmd = base + "time perl AftrRAD.pl -h" #output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) #print(output) ## numIndels-3 What do here? ## stringLength-99 We don't care about homoplymers here, so set it very high ## maxH-90 ?? ## AftrRAD manu analysis settings - P2: noP2; minDepth: 5; numIndel: 3; MinReads: 5 cmd = base + "time perl AftrRAD.pl minDepth-6 P2-noP2 minIden-85 " \ + "re-TGCAG stringLength-99 dplexedData-1 maxProcesses-40" print(cmd) output = "" output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) print(output) ## MinReads is mindepth_statistical cmd = base + "time perl Genotype.pl MinReads-6 subset-0 maxProcesses-40" print(cmd) output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) print(output) ## pctScored is min_samples_locus as a percent of total samples, here since ## we use 2 for min_samples_locus and there are 12 simulated samples we use 17% cmd = base + "time perl FilterSNPs.pl pctScored-17" print(cmd) output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) print(output) # ### Post-process aftrRAD output so we actually can get a vcf file # In[ ]: get_ipython().run_cell_magic('bash', '-s "$WORK_DIR" -s "$AFTRRAD_DIR"', 'export PATH="$1/miniconda/bin:$PATH"; export "WORK_DIR=$1"; export "AFTRRAD_DIR=$2"\ncd $AFTRRAD_DIR/REALDATA/Formatting\n\nperl OutputFasta.pl SNPsOnly-1\nsnp-sites -v -o REALDATA.vcf SNPMatrix_*.All.fasta\n') # ## dDocent empirical analysis # Runtime: ~ 15 minutes # In[ ]: import ipyrad as ip import subprocess import glob ## Set up directory structures. 
change the force flag if you want to ## blow everything away and restart # force = True force = ""if force and os.path.exists(DDOCENT_OUTPUT): shutil.rmtree(DDOCENT_OUTPUT) if not os.path.exists(DDOCENT_OUTPUT): os.makedirs(DDOCENT_OUTPUT) os.chdir(DDOCENT_OUTPUT) ## Have to copy all the raw files here so ddocent can find them IPYRAD_EDITS_DIR = os.path.join(IPYRAD_OUTPUT, "REALDATA_edits/") infiles = [ff for ff in glob.glob(IPYRAD_EDITS_DIR+"*_R1_*")] ## Don't recopy the data over and over again while testing if not os.path.exists(infiles[0].split("/")[-1]): for f in infiles: shutil.copy2(f, f.split("/")[-1]) ## Now we have to rename all the files in the way dDocent expects them: ## 1A_0_R1_.fastq.gz -> Pop1_Sample1.F.fq.gz name_mapping = {"29154_superba_R1_.fastq":"Pop1_Sample1.F.fq", \ "32082_przewalskii_R1_.fastq":"Pop2_Sample1.F.fq", \ "33588_przewalskii_R1_.fastq":"Pop2_Sample2.F.fq",\ "35236_rex_R1_.fastq":"Pop3_Sample1.F.fq", "39618_rex_R1_.fastq":"Pop3_Sample2.F.fq", \ "35855_rex_R1_.fastq":"Pop3_Sample3.F.fq", "40578_rex_R1_.fastq":"Pop3_Sample4.F.fq", \ "38362_rex_R1_.fastq":"Pop3_Sample5.F.fq", \ "41954_cyathophylloides_R1_.fastq":"Pop4_Sample1.F.fq", "30686_cyathophylla_R1_.fastq":"Pop4_Sample2.F.fq", \ "41478_cyathophylloides_R1_.fastq":"Pop4_Sample3.F.fq",\ "30556_thamno_R1_.fastq":"Pop5_Sample1.F.fq", "33413_thamno_R1_.fastq":"Pop5_Sample2.F.fq"} for k,v in name_mapping.items(): os.rename(k, v) ## Only runs on gzip files. get_ipython().system('gzip *.fq') ## Write out the config file for this run. ## Compacted the config file into one long line here to make it not take up so much room config_file = "empirical-config.txt".format(dir) with open(config_file, 'w') as outfile: outfile.write('Number of Processors\n40\nMaximum Memory\n0\nTrimming\nyes\nAssembly?\nyes\nType_of_Assembly\nSE\nClustering_Similarity%\n0.85\nMapping_Reads?\nyes\nMapping_Match_Value\n1\nMapping_MisMatch_Value\n3\nMapping_GapOpen_Penalty\n5\nCalling_SNPs?\nyes\nEmail\nwatdo@mailinator.com\n') cmd = "export LD_LIBRARY_PATH={}/freebayes-src/vcflib/tabixpp/htslib/:$LD_LIBRARY_PATH; ".format(DDOCENT_DIR) cmd += "export PATH={}:$PATH; time dDocent {}".format(DDOCENT_DIR, config_file) print(cmd) ## Have to run the printed command by hand from the ddocent REALDATA dir bcz it doesn't like running in the notebook #!$cmd ## NB: Must rename all the samples in the output vcf and then use vcf-shuffle-cols ## perl script in the vcf/perl directory to reorder the vcf file to match ## the output of stacks and ipyrad for pca/heatmaps to work. # dDocent makes you rename your samples and then it orders the vcf according to # these new sample names, so the order is different from ipyrad/pyrad/stacks # which fsck plotting downstream, so we have to relabel all the columns # to have the familiar names and then reorder them to be in the same order as the other progs. 
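#
# As a rough illustration (not the actual header line), the relabel step turns a dDocent
# header like
#
#     #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Pop1_Sample1 Pop2_Sample1 ...
#
# back into
#
#     #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 29154_superba 32082_przewalskii ...
#
# and vcf-shuffle-cols then reorders those sample columns (and the genotypes under them)
# to match the column order of the ipyrad REALDATA.vcf.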
# In[ ]:

## There's probably a better way to do this
for f in ["Final.recode.vcf", "TotalRawSNPs.vcf"]:
    vcffile = os.path.join(DDOCENT_OUTPUT, f)
    infile = open(vcffile, 'r')
    filedata = infile.readlines()
    infile.close()

    outfile = open(vcffile, 'w')
    for line in filedata:
        if "CHROM" in line:
            for ipname, ddname in name_mapping.items():
                ipname = ipname.split("_R1")[0]
                ddname = ddname.split(".")[0]
                line = line.replace(ddname, ipname)
        outfile.write(line)
    outfile.close()

    ## Now we have to reorder the genotype columns
    IPYRAD_VCF = os.path.join(IPYRAD_OUTPUT, "REALDATA_outfiles/REALDATA.vcf")
    os.chdir(os.path.join(DDOCENT_DIR, "vcftools_0.1.11/perl"))
    tmpvcf = os.path.join(DDOCENT_DIR, "ddocent-tmp.vcf")
    cmd = "perl vcf-shuffle-cols -t {} {} > {}".format(IPYRAD_VCF, vcffile, tmpvcf)
    print(cmd)
    os.system(cmd)
    os.rename(tmpvcf, vcffile)

# # Results
#
# ## Addendum
#
# This is me carping about organization of the stacks manual. The walkthrough doesn't mention
# that you _don't_ need to use mysql until somewhere in the late-middle of the process, long
# after you actually have to do the install in order to get it running.
# ```
# ## NB: If you pass the `-S` flag to denovo_map.pl then it won't require the sql backend.
# ## This message is totally buried in the docs.
#
# ## Also `--overw_db` and `--create_db` flags would have been good to know about.
#
# ## denovo_map.pl refused to run w/o mysql (command not found error) so I had to brew install it
# brew install mysql
# mysql.server start
# mysql -u root
# > GRANT ALL ON *.* TO 'stacks_user'@'localhost' IDENTIFIED BY 'stackspassword';
# > create database stacks;
#
# ## Populate the mysql database with the right tables
# mysql -u root stacks < /usr/local/share/stacks/sql/stacks.sql
#
# ## Edit the /usr/local/share/stacks/sql/mysql.cnf.dist file to reflect the username and password as above
# ## then copy this file to /usr/local/share/stacks/sql/mysql.cnf
# ```
#

# In[ ]:
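## A quick, rough sanity check one could run at this point: count the SNP records in
## each pipeline's final empirical VCF. The ipyrad, dDocent, and aftrRAD paths come
## from the cells above; the stacks path assumes the default populations output name
## for batch id 1 (batch_1.vcf). pyrad is omitted here because its output prefix is
## set inside the uuencoded params file.
vcfs = {
    "ipyrad": os.path.join(IPYRAD_OUTPUT, "REALDATA_outfiles/REALDATA.vcf"),
    "stacks_gapped": os.path.join(STACKS_GAP_OUT, "batch_1.vcf"),
    "aftrrad": os.path.join(AFTRRAD_OUTPUT, "Formatting/REALDATA.vcf"),
    "ddocent": os.path.join(DDOCENT_OUTPUT, "Final.recode.vcf"),
}
for prog, vcf in vcfs.items():
    if not os.path.exists(vcf):
        print("{} - missing ({})".format(prog, vcf))
        continue
    ## Count non-header records as a crude proxy for the number of SNPs retained
    with open(vcf) as infile:
        nsnps = sum(1 for line in infile if not line.startswith("#"))
    print("{} - {} SNP records".format(prog, nsnps))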