#!/usr/bin/env python # coding: utf-8 # # PE ddRAD data analysis with denovo and reference based assembly # Data from "Single-nucleotide polymorphism discovery and panel characterization in the African forest elephant" # # * http://onlinelibrary.wiley.com/doi/10.1002/ece3.3854/full?wol1URL=/doi/10.1002/ece3.3854/full®ionCode=US-NY&identityKey=34b23ec9-4666-4c37-8470-8448e64d6167 # # ## Other PE ddRAD datasets that could have been useful # Ptychadena # * http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0190440#sec022 # # Peromyscus (Munshi-South) # # Chinese sea bass (SRP094869) # * Population Genomics Reveals Genetic Divergence and Adaptive Differentiation of Chinese Sea Bass (Lateolabrax maculatus) # * https://link.springer.com/article/10.1007/s10126-017-9786-0#Sec2 # In[29]: import ipyrad as ip import ipyrad.analysis as ipa import ipyparallel as ipp import pandas as pd import toyplot import glob import gzip ## Ptychadena (Stephane) ## * http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0190440#sec022 ## Peromyscus (Munshi-South) ## Chinese sea bass (SRP094869) ## Population Genomics Reveals Genetic Divergence and Adaptive Differentiation of Chinese Sea Bass (Lateolabrax maculatus) ## * https://link.springer.com/article/10.1007/s10126-017-9786-0#Sec2 # In[ ]: get_ipython().run_cell_magic('bash', '', 'conda install -c eaton-lab toytree\nconda install -c bioconda sra-tools entrez-direct\n') # In[ ]: get_ipython().run_cell_magic('bash', '', '## Fetch the elephant reference genome\nmkdir ref\nwget ftp://ftp.broadinstitute.org/pub/assemblies/mammals/elephant/loxAfr3/assembly_supers.fasta.gz -o ref/assembly_supers.fasta.gz\n') # In[ ]: get_ipython().run_cell_magic('bash', '', '\n## Data from "Single-nucleotide polymorphism discovery and panel characterization in the African forest elephant"\n## http://onlinelibrary.wiley.com/doi/10.1002/ece3.3854/full?wol1URL=/doi/10.1002/ece3.3854/full®ionCode=US-NY&identityKey=34b23ec9-4666-4c37-8470-8448e64d6167\nipyrad --download SRP126637 raws/\n') # In[39]: ## R1 and R2 were concatenated in the SRA data files so we have to pull them apart. ## This is probably not the most efficient way to do this, but it works. raws = glob.glob("raws/*") print(raws) for r in raws: name = r.split("/")[1].split("_")[0] lines = gzip.open(r).readlines() nlines = len(lines)/2 print("{} {}".format(name, nlines)) with gzip.open("raws/{}_R1_.fastq.gz".format(name), 'wb') as r1: r1.write("".join(lines[:nlines])) with gzip.open("raws/{}_R2_.fastq.gz".format(name), 'wb') as r2: r2.write("".join(lines[nlines:])) # ``` # Fetching project data... # Run spots mates ScientificName SampleName # 0 SRR6371502 241196 0 Loxodonta cyclotis LOC0127 # 1 SRR6371503 1133408 0 Loxodonta cyclotis LOC0122 # 2 SRR6371505 1002779 0 Loxodonta cyclotis LOC0151 # 3 SRR6371506 1582988 0 Loxodonta cyclotis LOC0274 # 4 SRR6371507 2519228 0 Loxodonta cyclotis LOC0263 # 5 SRR6371508 1002140 0 Loxodonta cyclotis LOC0309 # 6 SRR6371509 2894868 0 Loxodonta cyclotis LOC0279 # 7 SRR6371510 1190860 0 Loxodonta cyclotis LOC0311 # 8 SRR6371511 267664 0 Loxodonta cyclotis LOC0310 # 9 SRR6371512 758060 0 Loxodonta cyclotis LOC0040 # 10 SRR6371513 2176494 0 Loxodonta cyclotis LOC0038 # 11 SRR6371514 2319874 0 Loxodonta cyclotis LOC0037 # 12 SRR6371515 906060 0 Loxodonta cyclotis LOC0035 # 13 SRR6371516 1133648 0 Loxodonta cyclotis LOC0051 # 14 SRR6371517 1816948 0 Loxodonta cyclotis LOC0050 # 15 SRR6371518 1471242 0 Loxodonta cyclotis LOC0049 # 16 SRR6371519 1366528 0 Loxodonta cyclotis LOC0041 # 17 SRR6371520 225068 0 Loxodonta cyclotis LOC0121 # 18 SRR6371521 1266382 0 Loxodonta cyclotis LOC0088 # ``` # In[37]: df = pd.DataFrame([x.split("\t") for x in """SRR6371521 SAMN08167496 LOC0088 WG011 178 95 SRX3466825 LOC0088_a Lope SRR6371520 SAMN08167497 LOC0121 WG012 31 16 SRX3466826 LOC0121_a Minkebe SRR6371519 SAMN08167492 LOC0041 WG005 192 102 SRX3466827 LOC0041_a Waka SRR6371518 SAMN08167493 LOC0049 WG008 207 110 SRX3466828 LOC0049_a Ivindo SRR6371517 SAMN08167494 LOC0050 WG009 256 135 SRX3466829 LOC0050_b Ivindo SRR6371516 SAMN08167495 LOC0051 WG010 160 85 SRX3466830 LOC0051_a Ivindo SRR6371515 SAMN08167488 LOC0035 WG001 127 69 SRX3466831 LOC0035_a Minkebe SRR6371514 SAMN08167489 LOC0037 WG002 327 175 SRX3466832 LOC0037_a Lope SRR6371513 SAMN08167490 LOC0038 WG003 307 164 SRX3466833 LOC0038_a Lope SRR6371512 SAMN08167491 LOC0040 WG004 106 56 SRX3466834 LOC0040_a Wonga Wongue SRR6371511 SAMN08167506 LOC0310 WG023 37 20 SRX3466835 LOC0310_a Moukalaba Doudou SRR6371510 SAMN08167507 LOC0311 WG024 168 89 SRX3466836 LOC0311_a Monts de Cristal SRR6371509 SAMN08167504 LOC0279 WG020 408 215 SRX3466837 LOC0279_b South Mulundu SRR6371508 SAMN08167505 LOC0309 WG022 141 75 SRX3466838 LOC0309_a Mayumba SRR6371507 SAMN08167502 LOC0263 WG018 355 188 SRX3466839 LOC0263_a Wonga Wongue SRR6371506 SAMN08167503 LOC0274 WG019 223 117 SRX3466840 LOC0274_a Loango SRR6371505 SAMN08167500 LOC0151 WG015 141 71 SRX3466841 LOC0151_a Moukalaba Doudou SRR6371503 SAMN08167498 LOC0122 WG013 159 85 SRX3466843 LOC0122_a Minkebe SRR6371502 SAMN08167499 LOC0127 WG014 34 18 SRX3466844 LOC0127_a Moukalaba Doudou""".split("\n")]) ## Just get the sequence identifier, the sample name, and the sample location df = df.iloc[:, [0,7,8]] # Launch the cluster by running this command on the command line: # # ipcluster start --n=19 --daemonize # In[5]: ## connect to cluster and check if all the engines are running ipyclient = ipp.Client() ipyclient.ids # In[19]: data = ip.Assembly("Loxodonta") # In[37]: ## set parameters data.set_params("project_dir", "ddrad-denovo") data.set_params("sorted_fastq_path", "raws/*_.fastq") data.set_params("datatype", "pairddrad") data.set_params("restriction_overhang", ("TGCAG", "CATGC")) data.set_params("clust_threshold", "0.90") data.set_params("filter_adapters", "2") data.set_params("max_Hs_consens", (5, 5)) data.set_params("trim_loci", (0, 0, 0, 0)) data.set_params("output_formats", "*") ## see/print all parameters data.get_params() data.write_params(force=True) # In[40]: data.run("12") # In[41]: ## access the stats of the assembly (so far) from the .stats attribute data.stats # In[42]: ## run steps 3-6 of the assembly data.run("34567") # In[7]: kvalues = [2, 3, 4, 5, 6] s = ipa.structure( name="quick", workdir="./analysis-structure", data="./ddrad-denovo/Loxodonta_outfiles/Loxodonta.ustr", ) ## set main params (use much larger values in a real analysis) s.mainparams.burnin = 1000 s.mainparams.numreps = 5000 ## submit N replicates of each test to run on parallel client for kpop in kvalues: s.run(kpop=kpop, nreps=4, ipyclient=ipyclient) ## wait for parallel jobs to finish ipyclient.wait() # In[8]: ## return the evanno table (deltaK) for best K etable = s.get_evanno_table(kvalues) etable # In[17]: ## set some clumpp params s.clumppparams.m = 3 ## use largegreedy algorithm s.clumppparams.greedy_option = 2 ## test nrepeat possible orders s.clumppparams.repeats = 10000 ## number of repeats s.clumppparams ## run clumpp for each value of K #tables = s.get_clumpp_table(kvalues, quiet=True) print(tables) #table = tables[4].sort_values(by=[0, 1, 2, 3]) table = tables[2].sort_values(by=[0, 1]) toyplot.bars( table, width=500, height=200, title=[[i] for i in table.index.tolist()], xshow=False, ); print(table)