Download the compressed fastq files from Zac's github repo.
%%bash
## download MiSeq data from Zac's github repo
curl -LkO https://github.com/zacforsman/example_ezRAD_data/raw/master/ipyrad_formatted_data.tar.gz
## de-compress it
tar -xzvf ipyrad_formatted_data.tar.gz
## download mt genome file
curl -LkO https://github.com/zacforsman/example_ezRAD_data/raw/master/Achatinella_sowerbyana.fasta
ipyrad_formatted_data/ASO1mtreads_R1_.fastq.gz ipyrad_formatted_data/ ipyrad_formatted_data/ASO5mtreads_R1_.fastq.gz ipyrad_formatted_data/ASO2mtreads_R1_.fastq.gz ipyrad_formatted_data/ASO3mtreads_R1_.fastq.gz ipyrad_formatted_data/ASO4mtreads_R1_.fastq.gz ipyrad_formatted_data/ASO6mtreads_R1_.fastq.gz
% Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 165 100 165 0 0 498 0 --:--:-- --:--:-- --:--:-- 500 100 3775k 100 3775k 0 0 6421k 0 --:--:-- --:--:-- --:--:-- 6421k % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 165 100 165 0 0 1165 0 --:--:-- --:--:-- --:--:-- 1178 100 15399 100 15399 0 0 70933 0 --:--:-- --:--:-- --:--:-- 70933
The data appears to be Miseq data, paired 300bp. The paired reads are also interleaved, i.e., R1 is on a line, and then read2 on the next line, and then another read1; as opposed to the R1 and R2 reads being in separate files which is what I'm used to seeing. I found a short script on this page (http://seqanswers.com/forums/showthread.php?t=38892)%5Bhttp://seqanswers.com/forums/showthread.php?t=38892] showing how to split this kind of data into two separate files. I wrote a Python function to do the same thing, below.
%%bash
gzip -d -c ipyrad_formatted_data/ASO1mtreads_R1_.fastq.gz | head -n 16 | cut -c 1-80
@M02308:132:000000000-ANV18:1:1101:6317:3772 1:N:0:5 GATCTCAATGTTGTTGTTATCTTATAACAGCTTAATAAACAACTTAATTTTCCATGATTAAGATTTACATAGAGAACTAT + CCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG @M02308:132:000000000-ANV18:1:1101:6317:3772 2:N:0:5 GATCAAAGAGTCGAAGATTTAACATTAGAAAAGGATTATTATCAATTATTCCTAAAATAAGACTAATATGATTTATTTTA + CCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGGGGGG @M02308:132:000000000-ANV18:1:1101:6319:3792 1:N:0:5 GATCTCAATGTTGTTGTTATCTTATAACAGCTTAATAAACAACTTAATTTTCCATGATTAAGATTTACATAGAGAACTAT + @CCCCGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGFFGGGGGGGGGGGGGGGGFFGGGGGGGGGGCGGGGGC9F @M02308:132:000000000-ANV18:1:1101:6319:3792 2:N:0:5 GATCAAAGAGTCGAAGATTTAACATTAGAAAAGGATTATTATCAATTATTCCTAAAATAAGACTAATATGATTTATTTTA + CCCCCGGGGGGGGGGGGGGGGFGGGGFGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGFGGGDCFGGGFGGG
gzip: stdout: Broken pipe
import itertools
import gzip
def split_miseq(input_file, out_r1, out_r2):
## open the input file
with gzip.open(input_file, 'r') as indat:
## line generator
liner = iter(indat)
qiter = itertools.izip(liner, liner, liner, liner)
## alternate between writing to out1 and out2
out1 = []
out2 = []
out = 1
## iterate until qiter is empty
while 1:
try:
quart = qiter.next()
if out == 1:
out1.append("".join(quart))
out = 2
else:
out2.append("".join(quart))
out = 1
except StopIteration:
break
## write lists to file
with gzip.open(out_r1, 'w') as write1:
write1.write("".join(out1))
with gzip.open(out_r2, 'w') as write2:
write2.write("".join(out2))
import glob
import os
## make new dirctory for split files
newdir = "split-fastqs"
if not os.path.exists(newdir):
os.makedirs(newdir)
## iterate over files
for miseqfile in glob.glob("ipyrad_formatted_data/*.gz"):
## get name from file
name = os.path.basename(miseqfile).split("_", 1)[0]
r1 = os.path.join(newdir, name + "_R1_.fastq.gz")
r2 = os.path.join(newdir, name + "_R2_.fastq.gz")
split_miseq(miseqfile, r1, r2)
%%bash
ls -l split-fastqs
total 3652 -rw-rw-r-- 1 deren deren 282503 Aug 23 12:42 ASO1mtreads_R1_.fastq.gz -rw-rw-r-- 1 deren deren 353785 Aug 23 12:42 ASO1mtreads_R2_.fastq.gz -rw-rw-r-- 1 deren deren 256707 Aug 23 12:42 ASO2mtreads_R1_.fastq.gz -rw-rw-r-- 1 deren deren 319318 Aug 23 12:42 ASO2mtreads_R2_.fastq.gz -rw-rw-r-- 1 deren deren 230505 Aug 23 12:42 ASO3mtreads_R1_.fastq.gz -rw-rw-r-- 1 deren deren 284714 Aug 23 12:42 ASO3mtreads_R2_.fastq.gz -rw-rw-r-- 1 deren deren 147815 Aug 23 12:42 ASO4mtreads_R1_.fastq.gz -rw-rw-r-- 1 deren deren 187029 Aug 23 12:42 ASO4mtreads_R2_.fastq.gz -rw-rw-r-- 1 deren deren 334216 Aug 23 12:42 ASO5mtreads_R1_.fastq.gz -rw-rw-r-- 1 deren deren 426128 Aug 23 12:42 ASO5mtreads_R2_.fastq.gz -rw-rw-r-- 1 deren deren 395910 Aug 23 12:42 ASO6mtreads_R1_.fastq.gz -rw-rw-r-- 1 deren deren 497179 Aug 23 12:42 ASO6mtreads_R2_.fastq.gz
import ipyrad as ip
print 'ipyrad', ip.__version__
ipyrad 0.7.11
## create a denovo assembly object
denovo = ip.Assembly("denovo")
denovo.set_params("project_dir", "analysis-ipyrad")
denovo.set_params("sorted_fastq_path", "split-fastqs/*.gz")
denovo.set_params("mindepth_majrule", 2)
denovo.set_params("datatype", "pairgbs")
denovo.set_params("filter_adapters", 2)
denovo.set_params("restriction_overhang", ("GATC", "GATC"))
New Assembly: denovo
## load the data, filter and cluster it.
denovo.run("123")
denovo.stats
Assembly: denovo [####################] 100% loading reads | 0:00:00 | s1 | [####################] 100% processing reads | 0:00:00 | s2 | [####################] 100% dereplicating | 0:00:00 | s3 | [####################] 100% clustering | 0:00:06 | s3 | [####################] 100% building clusters | 0:00:00 | s3 | [####################] 100% chunking | 0:00:00 | s3 | [####################] 100% aligning | 0:00:07 | s3 | [####################] 100% concatenating | 0:00:00 | s3 |
state | reads_raw | reads_passed_filter | clusters_total | clusters_hidepth | |
---|---|---|---|---|---|
ASO1mtreads | 3 | 2723 | 2721 | 273 | 126 |
ASO2mtreads | 3 | 2362 | 2360 | 254 | 118 |
ASO3mtreads | 3 | 1991 | 1990 | 302 | 146 |
ASO4mtreads | 3 | 1449 | 1447 | 194 | 86 |
ASO5mtreads | 3 | 3220 | 3215 | 297 | 141 |
ASO6mtreads | 3 | 3760 | 3755 | 271 | 129 |
## create a reference assembly object
ref = denovo.branch('reference')
ref.set_params("reference_sequence", "Achatinella_sowerbyana.fasta")
ref.set_params("assembly_method", "reference")
## map to reference (lots of hits to reference, but few clusters...)
ref.run("3", force=True)
ref.stats
Assembly: reference [####################] 100% dereplicating | 0:00:00 | s3 | [####################] 100% mapping | 0:00:01 | s3 | [####################] 100% fetch mapped reads | 0:00:00 | s3 | [####################] 100% chunking | 0:00:00 | s3 | [####################] 100% aligning | 0:00:11 | s3 | [####################] 100% concatenating | 0:00:00 | s3 |
state | reads_raw | reads_passed_filter | refseq_mapped_reads | refseq_unmapped_reads | clusters_total | clusters_hidepth | |
---|---|---|---|---|---|---|---|
ASO1mtreads | 3 | 2723 | 2721 | 1603 | 16 | 1 | 1 |
ASO2mtreads | 3 | 2362 | 2360 | 1522 | 23 | 1 | 1 |
ASO3mtreads | 3 | 1991 | 1990 | 1490 | 25 | 1 | 1 |
ASO4mtreads | 3 | 1449 | 1447 | 951 | 12 | 1 | 1 |
ASO5mtreads | 3 | 3220 | 3215 | 2067 | 17 | 1 | 1 |
ASO6mtreads | 3 | 3760 | 3755 | 2385 | 11 | 1 | 1 |
%%bash
## first 16 lines trimmed at 80 characters per line
zcat analysis-ipyrad/denovo_clust_0.85/ASO1mtreads.clustS.gz | head -n 16 | cut -c 1-80
006c3b4bfd1d393a7bf041856df8344e;size=39;* ----------------------------------------------GATCAAGTAAAATCAAATTTTAAAAATAAAAAAG ab4ec4807c2dca97c970118c1f234dd7;size=2;- ----------------------------------------------GATCAAGTAAAATCAAATTTTAAAAATAAAAAAG 549ea44a0ea3ab0b31deecbfeeb46513;size=1;- ATTATTGCAGATAAGAAGAGGAAAAAGTATATTTGTAGTAATATTAGAACAAGTAAAATCAAATTTTAAAAATAAAAAAG 2d0de67dcdfe85f6ccb42e829eb8864e;size=1;- -------------------------------------------------------AAATCAAATTTTAAAAATAAAAAAG 087c5fd2760f87460d551c3451088469;size=1;+ ------------AAGAAGAGGAAAAAGTATATTTGTAGTAATATTAGAACAAGTAAAATCAAATTTTAAAAATAAAAAAG 7dbb429703111a49e245e199aa005e16;size=1;+ --------------GAAGAGGAAAAAGTATATTTGTAGTAATATTAGAACAAGTAAAATCAAATTTTAAAAATAAAAAAG 914d88a9d5ce8b53c690026d55190a56;size=1;- ------------------------------------AGTAATATTAGAACAAGTAAAATCAAATTTTAAAAATAAAAAAG bfba95cfefb5d8a6ad90f3ca72417a89;size=1;- ----------------------------------------------GATCAAGTAAAATCAAATTTTTAAAATAAAAAAG
gzip: stdout: Broken pipe
denovo.run("4567")
Assembly: denovo [####################] 100% inferring [H, E] | 0:00:00 | s4 | [####################] 100% calculating depths | 0:00:00 | s5 | [####################] 100% chunking clusters | 0:00:00 | s5 | [####################] 100% consens calling | 0:00:01 | s5 | [####################] 100% concat/shuffle input | 0:00:00 | s6 | [####################] 100% clustering across | 0:00:03 | s6 | [####################] 100% building clusters | 0:00:00 | s6 | [####################] 100% aligning clusters | 0:00:01 | s6 | [####################] 100% database indels | 0:00:00 | s6 | [####################] 100% indexing clusters | 0:00:00 | s6 | [####################] 100% building database | 0:00:00 | s6 | [####################] 100% filtering loci | 0:00:00 | s7 | [####################] 100% building loci/stats | 0:00:00 | s7 | [####################] 100% building vcf file | 0:00:00 | s7 | [####################] 100% writing vcf file | 0:00:00 | s7 | [####################] 100% building arrays | 0:00:00 | s7 | [####################] 100% writing outfiles | 0:00:00 | s7 | Outfiles written to: ~/Downloads/ezrad-test/analysis-ipyrad/denovo_outfiles
denovo.stats
state | reads_raw | reads_passed_filter | clusters_total | clusters_hidepth | hetero_est | error_est | reads_consens | |
---|---|---|---|---|---|---|---|---|
ASO1mtreads | 6 | 2723 | 2721 | 273 | 126 | 0.006884 | 0.001928 | 120 |
ASO2mtreads | 6 | 2362 | 2360 | 254 | 118 | 0.010536 | 0.002622 | 108 |
ASO3mtreads | 6 | 1991 | 1990 | 302 | 146 | 0.016154 | 0.002737 | 129 |
ASO4mtreads | 6 | 1449 | 1447 | 194 | 86 | 0.009593 | 0.001640 | 75 |
ASO5mtreads | 6 | 3220 | 3215 | 297 | 141 | 0.016241 | 0.002657 | 129 |
ASO6mtreads | 6 | 3760 | 3755 | 271 | 129 | 0.016831 | 0.002391 | 118 |
denovo.stats_dfs.s7_loci
locus_coverage | sum_coverage | |
---|---|---|
1 | 0 | 0 |
2 | 0 | 0 |
3 | 0 | 0 |
4 | 2 | 2 |
5 | 2 | 4 |
6 | 2 | 6 |
## first 50 lines trimmed to 80 characters per line for easy visualization
cat analysis-ipyrad/denovo_outfiles/denovo.loci | head -n 50 | cut -c 1-80
ASO1mtreads TACCTTGAGGGCAAATATCATATTGGGGAGCGACAGTAATTACTAATCTTGTTAGGGCAATTCC ASO2mtreads TACCTTGAGGKCAAATATCATATTGGGGAGCAACAGTAATTACTAATCTTGTTAGGGCAATTCC ASO3mtreads TACCTTGAGGACAAATATCATATTGGGGAGCAACAGTAATTACTAATCTTGTTAGGGCAATTCC ASO4mtreads TACCTTGAGGGCAAATATCATATTGGGGAGCGACAGTAATTACTAATCTTGTTAGGGCAATTCC ASO5mtreads -----------CAAATATCATATTGGGGAGCAACAGTAATTACTAATCTTGTTAGGGCAATTCC ASO6mtreads --CCTTGAGGACAAATATCATATTGGGGAGCAACAGTAATTACTAATCTTGTTAGGGCAATTCC // * * ASO1mtreads ATGGAGTATTACATTACAAAACAAATAA-TTTAAACTCTAAAAATTATGGCGGCTAAAATAAAA ASO2mtreads --------------------------------------------------------------AA ASO3mtreads ATGGAGTATTACATTACAAAATAAATAATTTTAAACTCTAAAAATTATGGCGGCTAAAATAAAA ASO5mtreads ATGGAGTATTACATTAYAAAATAAATAATTTTAAACTCTAAAAATTATGGCGGCTAAAATAAAA ASO6mtreads ATGGAGTATTATATTATAAAATAAATAATTTTAAACTCTAAAAATTATGGCGGCTAAAATAAAA // - * - ASO1mtreads AAAGAGTCGAAGATTTAACATTAGAAAAGGATTATTATCAATTATTCCTAAAATAAGACTAATA ASO4mtreads AAAGAGTCGAAGATTTAACATTAGAAAAGGATTATTATCAATTATTCCTAAAATAAGACTAATA ASO5mtreads AAAAAGTCGAAGATTTAATATTAGAAAAGGATTATTATCAATTATTCCTAAAATAAGACTGATA ASO6mtreads AAAAAGTCGAAGATTTAATATTAGAAAAGGATTATTATCAATTATTCCTAAAATAAGACTGATA // * * * ASO1mtreads TGTTAATAATATAGTAATAGCACCAGCTAATACAGGTAATGATAATAATAATAAAAAAACTGTA ASO2mtreads TGTTAATAATATAGTAATAGCNCCAGCTAATACAGGTAATGANAANANTAATAAAAAAACTGTA ASO3mtreads TGTTAATAATATAGTAATAGCACCAGCTAATACAGGTAATGATAATAATAATAAAAAAACTGTA ASO4mtreads TGTTAATAATATAGTAATAGCACCAGCTAATACAGGTAATGATAATAATAATAAAAAAACTGTA ASO5mtreads TGTTAATAATATAGTAATAGCACCAGCTAATACAGGTAATGAYAATAATAATAAAAAAACTGTA ASO6mtreads TGTTAATANTATAGTAATAGCACCAGCTAATACAGGTAATGAYAATAATAATAAAAAAACTGTA // * ASO1mtreads AGCAATCGCGGTCATACTTTAATAAAAATTTTAAATATAATTAATATTTTAATTTACATAAATA ASO2mtreads AGCAATCGCGGTCATACTTTAATAAAAATTTTAAATATAATTAATATTTTAATTTACATAAATA ASO3mtreads AGCAATCGCGGTCATACTTTAATAAAAATTTTAAATATAATTAATATTTTAATTTACATAAATA ASO5mtreads AGCAATCGCGGTCATACTTTAATAAAAATTTTAAATATAATTAATATTTTAATTTACATAAATA // ASO1mtreads -----------------------------GATTACGAGCTTCCGCTCAATCTATTTCTTATGAA ASO3mtreads CTCAAACAGTGCATATTCTTTTTTAGGAAGATTACGAGCTTCCGCTCAATCTATTTCTTATGAA ASO4mtreads ATCAAACAGGGCATATTCTTTTTTAGGAAGATTACGAGCTTCCGCTCAATCTATTTCTTATGAA ASO5mtreads CTCAAACAGTGCATATTCTTTTTTAGGAAGATTACGAGCTTCCGCTCAATCTATTTCTTATGAA ASO6mtreads CTCAAACAGTGCATATTCTTTTTTAGGAAGATTACGAGCTTCCGCTCAATCTATTTCTTATGAA // - -
## conda install toytree -c eaton-lab
## conda install raxml -c bioconda
## pip install -U toyplot
### infer a quick tree
import ipyrad.analysis as ipa
import toytree
rax = ipa.raxml(name=denovo.name, data=denovo.outfiles.phy, N=100)
rax.run(force=True)
job denovo finished successfully
! cat $rax.trees.bipartitions
(ASO6mtreads:0.00096493071506020670,(ASO3mtreads:0.00331289126737833268,(ASO2mtreads:0.00000100000050002909,(ASO1mtreads:0.00195360187740938919,ASO4mtreads:0.00102686644445994734)100:0.01291933045876193960)54:0.00191331539519988786)63:0.00258728830268780069,ASO5mtreads:0.00109661776340834178);
tre = toytree.tree(rax.trees.bipartitions)
tre.draw(
width=300,
use_edge_lengths=True,
node_labels=tre.get_node_values('support'));