#!/usr/bin/env python # coding: utf-8 # # Simulated RAD-seq data for ipyrad testing # The simulations software simrrls is available at [github.com/dereneaton/simrrls](github.com/dereneaton/simrrls). # # # In[1]: import os import glob import gzip import itertools import numpy as np import simrrls import ipyrad as ip print "simrrls v.{}".format(simrrls.__version__) print "ipyrad v.{}".format(ip.__version__) # ### Organization # In[31]: ## the dir to write data to DIR = "./ipsimdata/" ## if it doesn't exist make it if not os.path.exists(DIR): os.makedirs(DIR) ## if theres anything in it, remove it for oldfile in glob.glob(os.path.join(DIR, "*")): os.remove(oldfile) # ## Simulate the data with simrrls # In[32]: get_ipython().run_cell_magic('bash', '-s $DIR', '\n## simulate rad_example (includes indels)\nsimrrls -o $1/rad_example -f rad -dm 10 -ds 2 -I 0.05 -L 1000\n\n## simulate larger rad data set\n#simrrls -o $1/rad_large -f rad -dm 10 -ds 2 -I 0.01 -L 10000\n\n## simulate pairddrad_example\nsimrrls -o $1/gbs_example -f gbs -dm 10 -ds 2 -I 0.05 -L 1000\n\n## simulate pairddrad_example\nsimrrls -o $1/pairddrad_example -f pairddrad -dm 10 -ds 2 -L 1000\n\n## simulate pairgbs_example\nsimrrls -o $1/pairgbs_example -f pairgbs -dm 10 -ds 2 -L 1000\n\n## simulate pairddrad_wmerge_example \nsimrrls -o $1/pairddrad_wmerge_example -f pairddrad -dm 10 -ds 2 \\\n -i1 -50 -i2 100 -L 1000\n\n## simulate pairgbs_wmerge_example (mixture of merged and non-merged reads)\nsimrrls -o $1/pairgbs_wmerge_example -f pairgbs -dm 10 -ds 2 \\\n -i1 -50 -i2 100 -L 1000\n') # ## Functions to insert reads into simulated genome # In[33]: import itertools import gzip import numpy as np # In[34]: ## Utility function def revcomp(sequence): "returns reverse complement of a string" sequence = sequence[::-1].strip()\ .replace("A", "t")\ .replace("T", "a")\ .replace("C", "g")\ .replace("G", "c").upper() return sequence # In[35]: def RAD_to_genome(R1s, R2s, n_inserts, insert_low, insert_high, out_chr): """ Create a simulated genome file with RAD data interspersed. """ ## start genome fasta sequence with 5000 random bp genome = np.random.choice(list("ATGC"), 5000) ## read in the RAD data files dat1 = gzip.open(R1s, 'r') qiter1 = itertools.izip(*[iter(dat1)]*4) if R2s: dat2 = gzip.open(R2s, 'r') qiter2 = itertools.izip(*[iter(dat2)]*4) else: qiter2 = itertools.izip(*[iter(str, 1)]*4) ## sample unique reads from rads uniqs = [] locid = 0 while len(uniqs) < n_inserts: ## grab a read and get locus id qrt1 = qiter1.next() qrt2 = qiter2.next() iloc = [] ilocid = int(qrt1[0].split("_")[1][5:]) ## go until end of locus copies while ilocid == locid: iloc.append([qrt1[1].strip(), qrt2[1].strip()]) qrt1 = qiter1.next() qrt2 = qiter2.next() ilocid = int(qrt1[0].split("_")[1][5:]) ## sample one read uniqs.append(iloc[0]) locid += 1 ## insert RADs into genome for ins in range(n_inserts): ## get read, we leave the barcode on cuz it won't hurt r1 = np.array(list(uniqs[ins][0])) r2 = np.array(list(revcomp(uniqs[ins][1]))) ## add R1 to genome genome = np.concatenate((genome, r1), axis=0) if qrt2[0]: ## add insert size howlong = np.random.uniform(insert_low, insert_high) chunk = np.random.choice(list("ATGC"), int(howlong)) #print howlong genome = np.concatenate((genome, chunk), axis=0) ## add read2 genome = np.concatenate((genome, r2), axis=0) ## add spacer between loci genome = np.concatenate((genome, np.random.choice(list("ATGC"), 5000)), axis=0) print("input genome is {} bp".format(genome.shape[0])) print("inserted {} loci into genome".format(n_inserts)) with open(out_chr, "w") as fasta: header = ">MT dna:chromosome chromosome:test:MT:1:{}:1 REF\n"\ .format(genome.shape[0]) fasta.write(header) outseq = list(genome) for i in range(0, len(outseq), 70): fasta.write("{}\n".format("".join(outseq[i:i+70]))) # ## Make 'genome' data files # In[36]: ## Meta args N_INSERTS = 500 INSERT_LOW = 250 INSERT_HIGH = 300 # In[37]: ## SE RAD data DATA_R1 = os.path.join(DIR, "rad_example_R1_.fastq.gz") OUTPUT_CHR = os.path.join(DIR, "rad_example_genome.fa") big = RAD_to_genome(DATA_R1, 0, N_INSERTS, INSERT_LOW, INSERT_HIGH, OUTPUT_CHR) # In[38]: ## SE GBS data DATA_R1 = os.path.join(DIR, "gbs_example_R1_.fastq.gz") OUTPUT_CHR = os.path.join(DIR, "gbs_example_genome.fa") RAD_to_genome(DATA_R1, 0, N_INSERTS, INSERT_LOW, INSERT_HIGH, OUTPUT_CHR) # In[39]: ## PAIR ddRAD data DATA_R1 = os.path.join(DIR, "pairddrad_example_R1_.fastq.gz") DATA_R2 = os.path.join(DIR, "pairddrad_example_R2_.fastq.gz") OUTPUT_CHR = os.path.join(DIR, "pairddrad_example_genome.fa") RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, INSERT_LOW, INSERT_HIGH, OUTPUT_CHR) # In[40]: ## PAIR ddRAD data DATA_R1 = os.path.join(DIR, "pairddrad_wmerge_example_R1_.fastq.gz") DATA_R2 = os.path.join(DIR, "pairddrad_wmerge_example_R2_.fastq.gz") OUTPUT_CHR = os.path.join(DIR, "pairddrad_wmerge_example_genome.fa") RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, INSERT_LOW, INSERT_HIGH, OUTPUT_CHR) # In[41]: ## PAIR GBS data DATA_R1 = os.path.join(DIR, "pairgbs_wmerge_example_R1_.fastq.gz") DATA_R2 = os.path.join(DIR, "pairgbs_wmerge_example_R2_.fastq.gz") OUTPUT_CHR = os.path.join(DIR, "pairgbs_wmerge_example_genome.fa") RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, INSERT_LOW, INSERT_HIGH, OUTPUT_CHR) # ## Run tests in ipyrad # ### rad_example # In[43]: ## create an assembly for denovo data1 = ip.Assembly("denovo") data1.set_params(1, "tests1") data1.set_params(2, os.path.join(DIR, 'rad_example_R1_.fastq.gz')) data1.set_params(3, os.path.join(DIR, 'rad_example_barcodes.txt')) data1.set_params("max_alleles_consens", 4) ## raise this for now ## branch into an assembly for reference data2 = data1.branch("reference") data2.set_params(5, 'reference') data2.set_params(6, os.path.join(DIR, 'rad_example_genome.fa')) ## branch into an assembly for denovo+reference data3 = data2.branch("denovo-plus") data3.set_params(5, 'denovo+reference') ## branch into an assembly for reference data4 = data2.branch("denovo-minus") data4.set_params(5, 'denovo-reference') ## assemble both data1.run("1234567", force=True) data2.run("1234567", force=True) data3.run("1234567", force=True) data4.run("1234567", force=True) # In[45]: NLOCI = 1000 ## check results assert all(data1.stats.clusters_hidepth == NLOCI) assert data1.stats_dfs.s7_loci.sum_coverage.max() == NLOCI assert all(data2.stats.clusters_hidepth == N_INSERTS) assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS assert all(data3.stats.clusters_hidepth == NLOCI) assert data3.stats_dfs.s7_loci.sum_coverage.max() == NLOCI assert all(data4.stats.clusters_hidepth == NLOCI-N_INSERTS) assert data4.stats_dfs.s7_loci.sum_coverage.max() == NLOCI-N_INSERTS # ### gbs example # In[38]: ## create an assembly for denovo data1 = ip.Assembly("denovo") data1.set_params(1, "tests2") data1.set_params(2, os.path.join(DIR, "gbs_example_R1_.fastq.gz")) data1.set_params(3, os.path.join(DIR, "gbs_example_barcodes.txt")) data1.set_params("max_alleles_consens", 4) ## raise this for now ## branch into an assembly for reference data2 = data1.branch("reference") data2.set_params(5, "reference") data2.set_params(6, os.path.join(DIR, "gbs_example_genome.fa")) ## branch into an assembly for denovo+reference data3 = data2.branch("denovo-plus") data3.set_params(5, 'denovo+reference') ## branch into an assembly for reference data4 = data2.branch("denovo-minus") data4.set_params(5, 'denovo-reference') ## assemble both data1.run("1234567", force=True) data2.run("1234567", force=True) data3.run("1234567", force=True) data4.run("1234567", force=True) # In[39]: ## check results assert all(data1.stats.clusters_hidepth == 1000) assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000 assert all(data2.stats.clusters_hidepth == N_INSERTS) assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS assert all(data3.stats.clusters_hidepth == 1000) assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000 assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS) assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS # ### pairddrad without merged reads example # In[52]: ## create an assembly for denovo data1 = ip.Assembly("denovo") data1.set_params(1, "tests3") data1.set_params(2, os.path.join(DIR, "pairddrad_example_*.fastq.gz")) data1.set_params(3, os.path.join(DIR, "pairddrad_example_barcodes.txt")) data1.set_params(7, "pairddrad") data1.set_params(8, ("TGCAG", "CCGG")) ## branch into an assembly for reference data2 = data1.branch("reference") data2.set_params(5, "reference") data2.set_params(6, os.path.join(DIR, "pairddrad_example_genome.fa")) ## branch into an assembly for denovo+reference data3 = data2.branch("denovo-plus") data3.set_params(5, 'denovo+reference') ## branch into an assembly for reference data4 = data2.branch("denovo-minus") data4.set_params(5, 'denovo-reference') ## assemble both data1.run("1234567", force=True) data2.run("1234567", force=True) data3.run("1234567", force=True) data4.run("1234567", force=True) # In[54]: ## check results assert all(data1.stats.clusters_hidepth == 1000) assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000 assert all(data2.stats.clusters_hidepth == N_INSERTS) assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS assert all(data3.stats.clusters_hidepth == 1000) assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000 assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS) assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS # In[62]: data2.stats_dfs # ### pairgbs w/ merged reads example # In[ ]: ## create an assembly for denovo data1 = ip.Assembly("denovo") data1.set_params(1, "tests4") data1.set_params(2, os.path.join(DIR, "pairgbs_wmerge_example_*.fastq.gz")) data1.set_params(3, os.path.join(DIR, "pairgbs_wmerge_example_barcodes.txt")) data1.set_params(7, "pairgbs") data1.set_params(8, ("TGCAG", "TGCAG")) ##... ## branch into an assembly for denovo+reference data3 = data1.branch("denovo-plus") data3.set_params(5, 'denovo+reference') ## branch into an assembly for reference data4 = data1.branch("denovo-minus") data4.set_params(5, 'denovo-reference') ## assemble both data1.run("1234567", force=True) data2.run("1234567", force=True) data3.run("1234567", force=True) data4.run("1234567", force=True) # In[ ]: ## check results assert all(data1.stats.clusters_hidepth == 1000) assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000 assert all(data2.stats.clusters_hidepth == N_INSERTS) assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS assert all(data3.stats.clusters_hidepth == 1000) assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000 assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS) assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS # ## Create zipped archive of sim data # In[63]: get_ipython().run_cell_magic('bash', '-s $DIR', '## compressed dir/ w/ all data files\ntar -zcvf ipsimdata.tar.gz $1 \n')