#!/usr/bin/env python # coding: utf-8 # ## Simulate RAD-seq data # The simulations software simrrls is available at [github.com/dereneaton/simrrls](github.com/dereneaton/simrrls). First we create a directory called `ipsimdata/` and then simulate data and put it in this directory. Simrrls has a few dependencies that are required. # ### Global variables # In[52]: ## name for our sim data directory DIR = "./ipsimdata" ## A mouse MT genome used to stick our data into. INPUT_CHR = "/home/deren/Downloads/MusMT.fa" ## number of RAD loci to simulate NLOCI = 1000 ## number of inserts to reference genome and insert size N_INSERTS = 100 INSERT_SIZE = 50 # ### Set up / clean up directories # In[53]: import os import shutil ## rm sim dir if it exists, else create it. while 1: if os.path.exists(DIR): shutil.rmtree(DIR) else: os.mkdir(DIR) ## make sure dir is finished removing if os.path.exists(DIR): break ## rm testdirs if they exist TESTDIRS = ["./testref1", "./testref2", "./testref3", "./testref4"] for testdir in TESTDIRS: if os.path.exists(testdir): shutil.rmtree(testdir) # ### Simulate the RAD data # In[54]: import simrrls print 'simrrls', simrrls.__version__ # In[55]: import subprocess ## this the bash command-line call to simrrls cmd = """\ simrrls -o {odir}/{oname} -f {form} -dm 10 -ds 2 -I 0.01 -L {nloci} -i1 {imin} -i2 {imax} """ ## simulate rad_example (includes indels) call = cmd.format(odir=DIR, oname="rad_example", form="rad", imin=50, imax=100, nloci=NLOCI) print call subprocess.check_output(call.split()) ## simulate gbs_example (includes indels) call = cmd.format(odir=DIR, oname="gbs_example", form="gbs", imin=50, imax=100, nloci=NLOCI) print call subprocess.check_output(call.split()) ## simulate pairddrad_example (includes indels) call = cmd.format(odir=DIR, oname="pairddrad_example", form="pairddrad", imin=50, imax=100, nloci=NLOCI) print call subprocess.check_output(call.split()) ## simulate gbs_example (includes indels) call = cmd.format(odir=DIR, oname="pairgbs_example", form="pairgbs", imin=50, imax=100, nloci=NLOCI) print call subprocess.check_output(call.split()) ## simulate pairddrad_example (includes indels and merged reads) call = cmd.format(odir=DIR, oname="pairddrad_wmerge_example", form="pairddrad", imin=-50, imax=50, nloci=NLOCI) print call subprocess.check_output(call.split()) ## simulate gbs_example (includes indels and merged reads) call = cmd.format(odir=DIR, oname="pairgbs_wmerge_example", form="pairgbs", imin=-50, imax=50, nloci=NLOCI) print call subprocess.check_output(call.split()) # ## Manufacture a psuedo-genome with hits from sim data # # These functions take simulated rad-data and a "large" input genome (really it could just be a randomly generated fasta), and randomly inserts a handful of simulated rad tags into the genome. This guarantees that reference mapping will actually do something. For PE simulated data R2 reads are reversed before they're inserted, because smalt is using the `-l pe` flag, which looks for reads in this orientation __`--> <--`__. Also, for PE inner mate distance is fixed at 50. If you wanna get ambitious you could draw this value from a distribution, but seems like more effort than it's worth. This wants to be run from __`ipsimdata/`__, but you can run it from anywhere if you update the paths. # # In[56]: import itertools import gzip import random from Bio import SeqIO # ## Function to insert reads into simulated genome # In[57]: ## Utility function def revcomp(sequence): "returns reverse complement of a string" sequence = sequence[::-1].strip()\ .replace("A", "t")\ .replace("T", "a")\ .replace("C", "g")\ .replace("G", "c").upper() return sequence # In[58]: def RAD_to_genome(R1s, R2s, n_inserts, insert_sz, input_chr, out_chr): """ Writes simulated rad data into a genome fasta file. Assumes RAD data file has names formatted like the output from simrrls. """ ## read in the full genome file record = SeqIO.read(input_chr, "fasta") lenchr = len(record.seq) ## read in the RAD data files dat1 = gzip.open(R1s, 'r') qiter1 = itertools.izip(*[iter(dat1)]*4) if R2s: dat2 = gzip.open(R2s, 'r') qiter2 = itertools.izip(*[iter(dat2)]*4) else: qiter2 = itertools.izip(*[iter(str, 1)]*4) ## sample unique reads from rads uniqs = [] locid = 0 while len(uniqs) < n_inserts: ## grab a read and get locus id qrt1 = qiter1.next() qrt2 = qiter2.next() iloc = [] ilocid = int(qrt1[0].split("_")[1][5:]) ## go until end of locus copies while ilocid == locid: iloc.append([qrt1[1].strip(), qrt2[1].strip()]) qrt1 = qiter1.next() qrt2 = qiter2.next() ilocid = int(qrt1[0].split("_")[1][5:]) ## sample one read uniqs.append(random.sample(iloc, 1)[0]) locid += 1 ## insert RADs into genome sloc = 100 for ins in range(n_inserts): ## get read, we leave the barcode on cuz it won't hurt r1 = uniqs[ins][0] r2 = uniqs[ins][1] if not r2: record.seq = record.seq[:sloc]+r1+\ record.seq[sloc:] else: record.seq = record.seq[:sloc]+r1+\ record.seq[sloc:sloc+insert_sz]+\ revcomp(r2)+\ record.seq[sloc+insert_sz:] sloc += 300 ## write to file rlen = len(qrt1[1].strip()) if r2: rlen *= 2 print("input genome is {} bp".format(lenchr)) print('imputed {} loci {} bp in len'.format(n_inserts, rlen)) print("new pseudo-genome is {} bp".format(len(record.seq))) output_handle = open(out_chr, "w") SeqIO.write(record, output_handle, "fasta") output_handle.close() # ## Make pseudo-ref data files # In[59]: ## SE RAD data DATA_R1 = DIR+"/rad_example_R1_.fastq.gz" OUTPUT_CHR = DIR+"/rad_example_genome.fa" RAD_to_genome(DATA_R1, 0, N_INSERTS, INSERT_SIZE, INPUT_CHR, OUTPUT_CHR) # In[60]: ## SE GBS data DATA_R1 = DIR+"/gbs_example_R1_.fastq.gz" OUTPUT_CHR = DIR+"/gbs_example_genome.fa" RAD_to_genome(DATA_R1, 0, N_INSERTS, INSERT_SIZE, INPUT_CHR, OUTPUT_CHR) # In[61]: ## PAIR ddRAD data DATA_R1 = DIR+"/pairddrad_wmerge_example_R1_.fastq.gz" DATA_R2 = DIR+"/pairddrad_wmerge_example_R2_.fastq.gz" OUTPUT_CHR = DIR+"/pairddrad_wmerge_example_genome.fa" RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, INSERT_SIZE, INPUT_CHR, OUTPUT_CHR) # In[62]: ## PAIR GBS data DATA_R1 = DIR+"/pairgbs_wmerge_example_R1_.fastq.gz" DATA_R2 = DIR+"/pairgbs_wmerge_example_R2_.fastq.gz" OUTPUT_CHR = DIR+"/pairgbs_wmerge_example_genome.fa" RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, INSERT_SIZE, INPUT_CHR, OUTPUT_CHR) # ## Run tests # ### rad_example # In[63]: import ipyrad as ip ## create an assembly for denovo data1 = ip.Assembly("denovo") data1.set_params(1, "testref1") data1.set_params(2, DIR+'/rad_example_R1_.fastq.gz') data1.set_params(3, DIR+'/rad_example_barcodes.txt') ## branch into an assembly for reference data2 = data1.branch("reference") data2.set_params(5, 'reference') data2.set_params(6, DIR+'/rad_example_genome.fa') ## assemble both data1.run(force=True) data2.run(force=True) ## check results assert data1.stats_dfs.s7_loci.sum_coverage.max() == NLOCI assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS # ### gbs example # In[64]: import ipyrad as ip ## create an assembly for denovo data1 = ip.Assembly("denovo") data1.set_params(1, "testref2") data1.set_params(2, DIR+'/gbs_example_R1_.fastq.gz') data1.set_params(3, DIR+'/gbs_example_barcodes.txt') ## branch into an assembly for reference data2 = data1.branch("reference") data2.set_params(5, 'reference') data2.set_params(6, DIR+'/gbs_example_genome.fa') ## assemble both data1.run(force=True) data2.run(force=True) ## check results assert data1.stats_dfs.s7_loci.sum_coverage.max() == NLOCI assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS # ### pairddrad example # In[65]: import ipyrad as ip ## create an assembly for denovo data1 = ip.Assembly("denovo") data1.set_params(1, "testref3") data1.set_params(2, DIR+'/pairddrad_wmerge_example_R1_.fastq.gz') data1.set_params(3, DIR+'/pairddrad_wmerge_example_barcodes.txt') ## branch into an assembly for reference data2 = data1.branch("reference") data2.set_params(5, 'reference') data2.set_params(6, DIR+'/pairddrad_wmerge_example_genome.fa') ## assemble both data1.run(force=True) data2.run(force=True) ## check results assert data1.stats_dfs.s7_loci.sum_coverage.max() == NLOCI assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS # In[66]: import ipyrad as ip ## create an assembly for denovo data1 = ip.Assembly("denovo") data1.set_params(1, "testref4") data1.set_params(2, DIR+'/pairgbs_wmerge_example_R1_.fastq.gz') data1.set_params(3, DIR+'/pairgbs_wmerge_example_barcodes.txt') ## branch into an assembly for reference data2 = data1.branch("reference") data2.set_params(5, 'reference') data2.set_params(6, DIR+'/pairgbs_wmerge_example_genome.fa') ## assemble both data1.run(force=True) data2.run(force=True) ## check results assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000 assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS # ### Clean up test dirs # In[67]: import glob import os ## rm dir if it exists, else create it. for tdir in glob.glob("testref[1-9]"): shutil.rmtree(tdir) ## rm reference index for iref in glob.glob(DIR+"/*_genome.fa.*"): os.remove(iref) # ### Create zipped simdata archive # In[68]: get_ipython().run_cell_magic('bash', '', '## compressed dir/ w/ all data files\ntar -zcvf ipsimdata.tar.gz ipsimdata/* \n')