#!/usr/bin/env python
# coding: utf-8

# # Cookbook: Merging samples or Assemblies
#
# ipyrad is designed to allow you to work with your samples in a very modular
# way. Meaning you can easily split your Assembly objects to group different
# samples into different runs that apply different parameter settings, and
# then you can merge them back together later. Similarly, with ipyrad you can
# easily merge multiple samples into a single sample, which may be useful when
# you include technical replicates in your sequencing lanes, or sequence an
# individual on multiple lanes.

# In[1]:

import ipyrad as ip


# ### Merging assemblies
#
# This will combine all samples from the two assemblies into a single
# assembly. If some samples have the same name in both assemblies they will
# be merged into one sample in the merged assembly.

# In[2]:


def _demux_lane(name):
    # Build one paired-ddRAD Assembly pointed at the simulated data set and
    # run step 1 (demultiplexing). Both lanes use identical parameters.
    lane = ip.Assembly(name)
    lane.params.raw_fastq_path = "./ipsimdata/pairddrad_example_R*.gz"
    lane.params.barcodes_path = "./ipsimdata/pairddrad_example_barcodes.txt"
    lane.params.assembly_method = "reference"
    lane.params.reference_sequence = "./ipsimdata/pairddrad_example_genome.fa"
    lane.params.datatype = "pairddrad"
    lane.run('1', force=True, launch_client=True)
    return lane


# demux first lane of data
data1 = _demux_lane("data1")

# demux second lane of data
data2 = _demux_lane("data2")

# merge assemblies and run the remaining assembly steps on the result
mdata = ip.merge("alldata", [data1, data2])
mdata.run("234567", force=True, launch_client=True)


# ### Merge samples
#
# Sometimes you may wish to merge samples within an Assembly. This can be
# done like below.
# In[2]:

# demux first lane of data
data1 = ip.Assembly("data1")
data1.params.raw_fastq_path = "./ipsimdata/rad_example_R1_.fastq.gz"
data1.params.barcodes_path = "./ipsimdata/rad_example_barcodes.txt"
data1.run('1', force=True, launch_client=True)


# In[3]:

# the rename dictionary is applied during merge; samples that end up
# sharing a name ("A" below) are merged into a single sample
mergedict = {
    "1A_0": "A",
    "1B_0": "A",
    "1C_0": "A",
    "1D_0": "D",
}
merged = ip.merge("merged", data1, rename_dict=mergedict)
merged.run("234567", force=True, launch_client=True)
merged.stats


# In[104]:

# debugging cells: drive Step7 (output-file writing) by hand
from ipyrad.assemble.write_outputs import *
import ipyparallel as ipp

ipyclient = ipp.Client()
step = Step7(merged, True, ipyclient)


# In[105]:

step.split_clusters()
step.remote_process_chunks()
step.collect_stats()
step.store_file_handles()
step.remote_build_arrays_and_write_loci()
step.remote_write_outfiles()


# In[106]:

step.remote_fill_depths()


# In[107]:

step.remote_build_vcf()


# In[91]:

# SNPs belonging to locus 2 (snpsmap column 0 is the locus index)
filler.snpsmap[filler.snpsmap[:, 0] == 2]


# In[ ]:


# In[102]:

# manual re-run of the depth-filling performed by remote_fill_depths(),
# for one sample, with prints for inspection
#step.remote_fill_depths()
sample = step.data.samples["D"]
filler = VCF_filler(step.data, step.nsnps, sample)

# run function loop over chunked locus files
for idx in range(len(filler.locbits)):
    filler.localidx = 0

    # locfill function: trim edges and individual bits for this chunk
    edges = np.load(filler.trimbits[idx])
    inds = filler.indbits[idx]
    filler.loclines = iter(open(filler.locbits[idx], 'r'))

    while 1:
        try:
            filler.yield_loc()
        except StopIteration:
            break

        # SNPs that fall in the current locus
        filler.locsnps = filler.snpsmap[filler.snpsmap[:, 0] == filler.locidx]
        filler.gtrim = edges[filler.localidx - 1]

        # denovo enter catgs func
        if (filler.locsnps.size) and (filler.sname in filler.names):
            nidx = filler.names.index(filler.sname)
            sidx = filler.sidxs[nidx]
            # sidx encodes "cidx-coffset" pairs separated by ":"
            tups = [[int(j) for j in i.split("-")] for i in sidx.split(":")]
            seq = filler.seqs[nidx]
            seqarr = np.array(list(seq))
            print('idx', idx, 'locidx', filler.locidx - 1)
            for snp in filler.locsnps[:, 1]:
                print(snp, seq[snp])
                # shift SNP position left by the number of indels before it
                ishift = seq[:snp].count('-')
                for tup in tups:
                    cidx, coffset = tup
                    pos = snp + (coffset) - ishift
                    print(pos, (snp, filler.gtrim, coffset, ishift))
                    print(filler.catgs[cidx, pos])
                    if (pos >= 0) & (pos < filler.maxlen):
                        filler.vcfd[filler.snpidx] += filler.catgs[cidx, pos]
                filler.snpidx += 1
            print(seq)
            print("")
        else:
            print("NOT HERE")
            # sample absent from this locus: skip over its SNP rows
            filler.snpidx += filler.locsnps.shape[0]


# In[77]:

seq


# In[42]:


def merge(name, assemblies, rename_dict=None):
    """
    Creates and returns a new Assembly object in which samples from two or
    more Assembly objects with matching names are 'merged'. Merging does not
    affect the actual files written on disk, but rather creates new Samples
    that are linked to multiple data files, and with stats summed.

    Parameters
    ----------
    name : str
        Name for the new merged Assembly.
    assemblies : Assembly or sequence of Assembly
        One or more Assembly objects whose samples will be merged.
    rename_dict : dict, optional
        Maps old sample names to new names; samples that end up sharing a
        name (within or across assemblies) are merged into a single Sample.

    Examples
    --------
    # merge two assemblies
    new = ip.merge('newname', (assembly1, assembly2))

    # merge two assemblies and rename samples
    rename = {"1A_0": "A", "1B_0": "A"}
    new = ip.merge('newname', (assembly1, assembly2), rename_dict=rename)
    """
    # create new Assembly
    merged = Assembly(name)

    # normalize the default so the membership test below never sees None
    # (previously `sname in rename_dict` raised TypeError when no rename
    # dict was passed)
    if rename_dict is None:
        rename_dict = {}

    # one or multiple assemblies?
    try:
        _ = len(assemblies)
    except TypeError:
        assemblies = [assemblies]

    # iterate over all sample names from all Assemblies
    for data in assemblies:

        # make a deepcopy so the input Assemblies are left untouched
        ndata = copy.deepcopy(data)
        for sname, sample in ndata.samples.items():

            # rename sample if in rename dict
            if sname in rename_dict:
                sname = rename_dict[sname]
                sample.name = sname

            # is it in the merged assembly already
            if sname in merged.samples:
                msample = merged.samples[sname]

                # update stats
                msample.stats.reads_raw += sample.stats.reads_raw
                if sample.stats.reads_passed_filter:
                    msample.stats.reads_passed_filter += (
                        sample.stats.reads_passed_filter)

                # append files
                msample.files.fastqs.append(sample.files.fastqs)
                msample.files.edits.append(sample.files.edits)

                # do not allow state >2 at merging (requires reclustering)
                # if merging WITHIN samples.
                msample.stats.state = min(sample.stats.state, 2)

            # merge its stats and files
            else:
                merged.samples[sname] = sample

    # Merged assembly inherits max of hackers values (max frag length)
    merged.hackersonly.max_fragment_length = max(
        [i.hackersonly.max_fragment_length for i in assemblies])

    # Set the values for some params that don't make sense inside mergers
    merged_names = ", ".join([i.name for i in assemblies])
    merged.params.raw_fastq_path = "Merged: " + merged_names
    merged.params.barcodes_path = "Merged: " + merged_names
    merged.params.sorted_fastq_path = "Merged: " + merged_names

    # return the new Assembly object
    merged.save()
    return merged