#!/usr/bin/env python
# coding: utf-8

# NOTE(review): this file is an IPython/Jupyter nbconvert script export whose
# newlines were collapsed onto a single physical line; the cells and markdown
# comments are re-expanded here. Bare notebook magics (cd/ls/cat/run) are not
# valid Python in a .py export, so they are expressed via
# get_ipython().run_line_magic(), matching nbconvert's convention.

# In[1]:

get_ipython().system('date')


# In[2]:

get_ipython().run_cell_magic('bash', '', 'system_profiler SPSoftwareDataType\n')


# In[4]:

# NOTE(review): the grep pattern should ideally be quoted ('[SH][ea]') to
# avoid accidental shell globbing against files in the working directory;
# left as-is to preserve the recorded command.
get_ipython().run_cell_magic('bash', '', '#Excludes serial number and hardware UUID\nsystem_profiler SPHardwareDataType | grep -v [SH][ea]\n')


# ## Install pyrad and dependencies on Hummingbird

# ### Note: Due to permissions issues (i.e. requiring sudo and a password), most of the installation took place outside of the notebook. I've entered most of the commands here anyway, for demonstration purposes.

# In[1]:

get_ipython().run_line_magic('cd', '/usr/local/bioinformatics/')


# In[7]:

get_ipython().run_cell_magic('bash', '', 'mkdir pyrad\n')


# In[8]:

get_ipython().run_line_magic('cd', 'pyrad')


# In[9]:

get_ipython().run_cell_magic('bash', '', 'wget https://github.com/dereneaton/pyrad/archive/3.0.66.tar.gz\n')


# In[10]:

get_ipython().run_line_magic('ls', '')


# In[11]:

get_ipython().run_cell_magic('bash', '', 'tar -zxvf 3.0.66.tar.gz\n')


# In[6]:

get_ipython().run_line_magic('cd', '/usr/local/bioinformatics/pyrad-3.0.66/')


# In[7]:

get_ipython().run_line_magic('ls', '')


# In[8]:

# Presumably run as "python setup.py install" (with sudo, outside the
# notebook, per the note above) — TODO confirm against the original .ipynb.
get_ipython().run_line_magic('run', 'setup.py install')


# In[9]:

get_ipython().run_line_magic('ls', '')


# In[10]:

get_ipython().run_line_magic('cd', '..')


# In[11]:

get_ipython().run_cell_magic('bash', '', 'wget http://www.drive5.com/muscle/downloads3.8.31/muscle3.8.31_i86darwin64.tar.gz\n')


# In[13]:

get_ipython().run_cell_magic('bash', '', 'tar -zxvf muscle3.8.31_i86darwin64.tar.gz\n')


# In[14]:

get_ipython().run_line_magic('ls', '')


# In[15]:

get_ipython().run_line_magic('cd', 'muscle3.8.31_i86darwin64')


# In[16]:

get_ipython().run_line_magic('cat', 'source.sh')


# In[2]:

get_ipython().run_cell_magic('bash', '', 'wget https://github.com/torognes/vsearch/releases/download/v1.11.1/vsearch-1.11.1-osx-x86_64.tar.gz\n')


# In[3]:

get_ipython().run_cell_magic('bash', '', 'tar -xzf vsearch-1.11.1-osx-x86_64.tar.gz \n')


# In[4]:

get_ipython().run_line_magic('ls', '')


# In[5]:

get_ipython().run_line_magic('cat', 'source.sh')


# # pyRAD analysis of Olympia oyster PE-GBS data

# ## The remainder of the notebook is taken from the [pyRAD example](http://nbviewer.jupyter.org/gist/dereneaton/1f661bfb205b644086cc/PE-GBS_empirical.ipynb). I've made the appropriate changes to reflect file locations on my local system, as well as to the params.txt file.
# ## Any text sections below this line are from the example notebook.

# ---

# # Example PE-GBS (w/ merged reads) notebook
#
# Paired-end GBS (or similarly, paired Ez-RAD) data may commonly contain first and second reads that overlap to some degree and therefore should be merged. This notebook will outline how to analyze such data in _pyRAD_ (v.3.03 or newer). I will demonstrate this example using an empirical data set of mine. To begin, we will create an empty directory within our current directory in which to perform the assembly.

# In[6]:

get_ipython().run_line_magic('cd', '/Volumes/owl/temp/')


# In[9]:

get_ipython().run_cell_magic('bash', '', 'mkdir oly_gbs_pyrad\n')


# In[2]:

get_ipython().run_line_magic('cd', '/Volumes/owl/temp/oly_gbs_pyrad/')


# In[11]:

get_ipython().run_cell_magic('bash', '', 'mkdir -p analysis_pyrad\n')


# In[12]:

get_ipython().run_line_magic('ls', '')


# In[17]:

# Copy the raw GBS fastq files (samples matching *_1A_* through *_10A_*)
# from the network share into the working directory.
get_ipython().run_cell_magic('bash', '', 'time cp /Volumes/owl_web/nightingales/O_lurida/20160223_gbs/*_{1..10}A_*.fq.gz .\n')


# In[3]:

get_ipython().run_line_magic('ls', '')


# Next we create an empty params file by calling pyRAD with the -n option.

# In[13]:

get_ipython().run_cell_magic('bash', '', 'pyrad -n\n')


# ## Enter params file arguments

# Then we enter some arguments into the params file, which can be done using any text editor. I use the script below to automate it here. Some important arguments for dealing with PE-GBS data in particular include the following:
#
# + Set the correct datatype. In the case of this analysis, we will use 'pairgbs' for de-multiplexing in step 1, but then switch it to 'merged' for steps 2-7 after we merge the paired reads.
#
# + The filter setting (param 21) is not needed in this case because we will be using an external read merging program that applies this filtering.
#
# + We will, however, still probably want to trim the overhanging edges of reads (param 29), which helps to remove barcodes or adapters that were missed.
#
# + It can also be helpful with this data type to use a low mindepth setting in conjunction with majority rule consensus base calls (param 31).
# This will tell pyrad to make statistical base calls for all sites with depth >mindepth, but to make majority rule base calls at sites with depth lower than mindepth, but higher than the majority rule minimum depth. I first run the analysis with the option turned off, and then at the end of the tutorial show the difference with it turned on.

# #### Let's view the params file

# In[4]:

get_ipython().run_line_magic('cat', 'params.txt')


# ## Step 1: De-multiplex reads

# In[21]:

get_ipython().run_cell_magic('bash', '', 'time pyrad -p params.txt -s 1\n')


# #### Let's look at the first 100 lines of the stats output for demultiplexing

# It looks like most reads from each file were matched to a barcode.

# In[22]:

get_ipython().run_cell_magic('bash', '', 'head -n 100 analysis_pyrad/stats/s1.sorting.txt\n')


# ## Merge paired-reads

# For this we will use the program [__PEAR__](https://github.com/xflouris/PEAR). This step is very important. I can pretty much assure you that you will have many paired end reads in your library that overlap, and if you do not merge these reads they can really mess up your analysis. In this case the vast majority of paired reads are overlapping (~80%), and in fact, the merged reads are the only portion of the data that we will be using for analysis.
#
# The first thing we have to do is unzip the reads if they are gzipped. If you demultiplexed your data in pyrad they will be in the fastq/ directory.
# In[24]:

get_ipython().run_cell_magic('bash', '', 'mv *.gz /Volumes/owl/temp/oly_gbs_pyrad/analysis_pyrad/\n')


# In[25]:

get_ipython().run_line_magic('ls', '/Volumes/owl/temp/oly_gbs_pyrad/analysis_pyrad/')


# In[26]:

get_ipython().run_line_magic('mkdir', '/Volumes/owl/temp/oly_gbs_pyrad/analysis_pyrad/fastq')


# In[27]:

get_ipython().run_cell_magic('bash', '', 'mv /Volumes/owl/temp/oly_gbs_pyrad/analysis_pyrad/*.gz /Volumes/owl/temp/oly_gbs_pyrad/analysis_pyrad/fastq\n')


# In[28]:

get_ipython().run_line_magic('ls', '/Volumes/owl/temp/oly_gbs_pyrad/analysis_pyrad/fastq')


# In[5]:

get_ipython().run_cell_magic('bash', '', "## For this example I subselect only the samples from \n## this library that start with the letter 'd'\ntime gunzip analysis_pyrad/fastq/*.gz\n")


# In[9]:

get_ipython().run_cell_magic('bash', '', 'echo analysis_pyrad/fastq/*\n')


# I prefer using a stringent filter setting, including the `-q` option which trims parts of the read that are low quality. This often results in removing all reads that are not merged. You can assemble merged and non-merged reads separately, as described in the 'non-merged PE-GBS tutorial', but __here we will focus on just analyzing the merged reads.__

# In[11]:

# Merge each *_1.fq/*_2.fq pair with PEAR; output goes next to the inputs
# and stdout/stderr are appended to pear.log.
get_ipython().run_cell_magic('bash', '', 'for gfile in analysis_pyrad/fastq/*_1.fq;\n do pear -f $gfile \\\n -r ${gfile/_1.fq/_2.fq} \\\n -o ${gfile/_1.fq/} \\\n -n 33 \\\n -t 33 \\\n -q 10 \\\n -j 20 >> pear.log 2>&1;\ndone\n')


# ### Set the data location to the de-multiplexed & merged data

# In[16]:

get_ipython().run_line_magic('cd', './analysis_pyrad/fastq/')


# In[17]:

get_ipython().run_line_magic('ls', '')


# In[12]:

# NOTE(review): the collapsed export split this statement's string literal
# across two physical lines; rejoined here as one sed command that rewrites
# line "## 18." of params.txt — TODO confirm against the original .ipynb.
get_ipython().run_cell_magic('bash', '', "## set location of demultiplexed data that are 'pear' filtered\nsed -i '/## 18./c\\analysis_pyrad/fastq/*.assembled.fastq ## 18. demulti data loc ' ./params.txt\n")


# In[21]:

get_ipython().run_line_magic('cd', '/Volumes/owl/temp/oly_gbs_pyrad/')


# In[22]:

get_ipython().run_cell_magic('bash', '', 'cat params.txt\n')


# ### Set the datatype to "merged"

# In[23]:

get_ipython().run_cell_magic('bash', '', 'cat params.txt\n')


# ## Step 2: Quality filtering

# In[24]:

get_ipython().run_cell_magic('bash', '', 'time pyrad -p params.txt -s 2\n')


# In[27]:

get_ipython().run_line_magic('ls', '')


# #### Let's take a look at the results

# In[29]:

get_ipython().run_cell_magic('bash', '', 'cat stats/s2.rawedit.txt\n')


# ## Step 3: Clustering

# In[30]:

get_ipython().run_cell_magic('bash', '', 'pyrad -p params.txt -s 3\n')


# #### Let's take a look at the results

# In[31]:

get_ipython().run_cell_magic('bash', '', 'cat stats/s3.clusters.txt\n')


# In[32]:

get_ipython().run_line_magic('ls', '')


# In[33]:

get_ipython().run_line_magic('cd', '..')


# In[38]:

get_ipython().run_cell_magic('bash', '', 'time cp -r /Volumes/owl/temp/oly_gbs_pyrad/ /Volumes/Data/Sam/scratch/oly_gbs_pyrad/\n')


# In[39]:

get_ipython().run_line_magic('cd', '/Volumes/Data/Sam/scratch/oly_gbs_pyrad/')


# #### and let's look at an example cluster file

# In[57]:

get_ipython().run_cell_magic('bash', '', 'gzip -cd analysis_pyrad/clust.85/d35422.assembled.clustS.gz | head -n 100 | cut -b 1-80\n')


# ## Step 4: Parameter estimation

# In[40]:

get_ipython().run_cell_magic('bash', '', 'time pyrad -p params.txt -s 4\n')


# In[41]:

get_ipython().run_cell_magic('bash', '', 'cat stats/Pi_E_estimate.txt\n')


# ## Step 5: Consensus base calling
#
# You may want to adjust:
# + max cluster size (param 33)
# + max number of Ns
# + max number of Hs

# In[42]:

get_ipython().run_cell_magic('bash', '', 'time pyrad -p params.txt -s 5\n')


# In[43]:

get_ipython().run_cell_magic('bash', '', 'cat stats/s5.consens.txt\n')


# ## Cluster across samples

# In[44]:

get_ipython().run_cell_magic('bash', '', 'time pyrad -p params.txt -s 6\n')


# ## Assemble final data set

# In[45]:

get_ipython().run_cell_magic('bash', '', 'time pyrad -p params.txt -s 7\n')


# ## Examine results

# In[46]:

get_ipython().run_cell_magic('bash', '', 'cat stats/oly_gbs_pyrad.stats\n')
# In[47]:

get_ipython().run_cell_magic('bash', '', 'head -n 100 outfiles/oly_gbs_pyrad.loci | cut -b 1-88\n')


# In[48]:

get_ipython().run_line_magic('ls', 'stats/')


# In[49]:

get_ipython().run_line_magic('ls', 'outfiles/')


# In[50]:

get_ipython().run_cell_magic('bash', '', 'head -1 outfiles/oly_gbs_pyrad.snps\n')


# In[ ]: