#!/usr/bin/env python
# coding: utf-8

# # Cookbook: Merging samples or Assemblies
#
# ipyrad is designed to allow you to work with your samples in a very modular
# way. Meaning you can easily split your Assembly objects to group different
# samples into different runs that apply different parameter settings, and
# then you can merge them back together later. Similarly, with ipyrad you can
# easily merge multiple samples into a single sample, which may be useful when
# you include technical replicates in your sequencing lanes, or sequence an
# individual on multiple lanes.

# In[1]:

import ipyrad as ip


# ### Merging assemblies
#
# This will combine all samples from the two assemblies into a single
# assembly. If some samples have the same name in both assemblies they will
# be merged into one sample in the merged assembly.

# In[2]:


def _demux_lane(name):
    # Build one paired-ddRAD Assembly pointed at the simulated data set and
    # run step 1 (demultiplexing). Both lanes use identical parameters.
    lane = ip.Assembly(name)
    lane.params.raw_fastq_path = "./ipsimdata/pairddrad_example_R*.gz"
    lane.params.barcodes_path = "./ipsimdata/pairddrad_example_barcodes.txt"
    lane.params.assembly_method = "reference"
    lane.params.reference_sequence = "./ipsimdata/pairddrad_example_genome.fa"
    lane.params.datatype = "pairddrad"
    lane.run('1', force=True, launch_client=True)
    return lane


# demux first lane of data
data1 = _demux_lane("data1")

# demux second lane of data
data2 = _demux_lane("data2")

# merge assemblies and run the remaining assembly steps on the result
mdata = ip.merge("alldata", [data1, data2])
mdata.run("234567", force=True, launch_client=True)


# ### Merge samples
#
# Sometimes you may wish to merge samples within an Assembly. This can be
# done like below.
# In[2]:

# demux first lane of data
data1 = ip.Assembly("data1")
data1.params.raw_fastq_path = "./ipsimdata/rad_example_R1_.fastq.gz"
data1.params.barcodes_path = "./ipsimdata/rad_example_barcodes.txt"
data1.run('1', force=True, launch_client=True)


# In[3]:

# the rename dictionary is applied during merge; samples that end up
# sharing a name ("A" below) are merged into a single sample
mergedict = {
    "1A_0": "A",
    "1B_0": "A",
    "1C_0": "A",
    "1D_0": "D",
}
merged = ip.merge("merged", data1, rename_dict=mergedict)
merged.run("234567", force=True, launch_client=True)
merged.stats


# In[104]:

# debugging cells: drive Step7 (output-file writing) by hand
from ipyrad.assemble.write_outputs import *
import ipyparallel as ipp

ipyclient = ipp.Client()
step = Step7(merged, True, ipyclient)


# In[105]:

step.split_clusters()
step.remote_process_chunks()
step.collect_stats()
step.store_file_handles()
step.remote_build_arrays_and_write_loci()
step.remote_write_outfiles()


# In[106]:

step.remote_fill_depths()


# In[107]:

step.remote_build_vcf()


# In[91]:

# SNPs belonging to locus 2 (snpsmap column 0 is the locus index)
filler.snpsmap[filler.snpsmap[:, 0] == 2]


# In[ ]:


# In[102]:

# manual re-run of the depth-filling performed by remote_fill_depths(),
# for one sample, with prints for inspection
#step.remote_fill_depths()
sample = step.data.samples["D"]
filler = VCF_filler(step.data, step.nsnps, sample)

# run function loop over chunked locus files
for idx in range(len(filler.locbits)):
    filler.localidx = 0

    # locfill function: trim edges and individual bits for this chunk
    edges = np.load(filler.trimbits[idx])
    inds = filler.indbits[idx]
    filler.loclines = iter(open(filler.locbits[idx], 'r'))

    while 1:
        try:
            filler.yield_loc()
        except StopIteration:
            break

        # SNPs that fall in the current locus
        filler.locsnps = filler.snpsmap[filler.snpsmap[:, 0] == filler.locidx]
        filler.gtrim = edges[filler.localidx - 1]

        # denovo enter catgs func
        if (filler.locsnps.size) and (filler.sname in filler.names):
            nidx = filler.names.index(filler.sname)
            sidx = filler.sidxs[nidx]
            # sidx encodes "cidx-coffset" pairs separated by ":"
            tups = [[int(j) for j in i.split("-")] for i in sidx.split(":")]
            seq = filler.seqs[nidx]
            seqarr = np.array(list(seq))
            print('idx', idx, 'locidx', filler.locidx - 1)
            for snp in filler.locsnps[:, 1]:
                print(snp, seq[snp])
                # shift SNP position left by the number of indels before it
                ishift = seq[:snp].count('-')
                for tup in tups:
                    cidx, coffset = tup
                    pos = snp + (coffset) - ishift
                    print(pos, (snp, filler.gtrim, coffset, ishift))
                    print(filler.catgs[cidx, pos])
                    if (pos >= 0) & (pos < filler.maxlen):
                        filler.vcfd[filler.snpidx] += filler.catgs[cidx, pos]
                filler.snpidx += 1
            print(seq)
            print("")
        else:
            print("NOT HERE")
            # sample absent from this locus: skip over its SNP rows
            filler.snpidx += filler.locsnps.shape[0]


# In[77]:

seq


# In[42]:


def merge(name, assemblies, rename_dict=None):
    """
    Creates and returns a new Assembly object in which samples from two or
    more Assembly objects with matching names are 'merged'. Merging does not
    affect the actual files written on disk, but rather creates new Samples
    that are linked to multiple data files, and with stats summed.

    Parameters
    ----------
    name : str
        Name for the new merged Assembly.
    assemblies : Assembly or sequence of Assembly
        One or more Assembly objects whose samples will be merged.
    rename_dict : dict, optional
        Maps old sample names to new names; samples that end up sharing a
        name (within or across assemblies) are merged into a single Sample.

    Examples
    --------
    # merge two assemblies
    new = ip.merge('newname', (assembly1, assembly2))

    # merge two assemblies and rename samples
    rename = {"1A_0": "A", "1B_0": "A"}
    new = ip.merge('newname', (assembly1, assembly2), rename_dict=rename)
    """
    # create new Assembly
    merged = Assembly(name)

    # normalize the default so the membership test below never sees None
    # (previously `sname in rename_dict` raised TypeError when no rename
    # dict was passed)
    if rename_dict is None:
        rename_dict = {}

    # one or multiple assemblies?
    try:
        _ = len(assemblies)
    except TypeError:
        assemblies = [assemblies]

    # iterate over all sample names from all Assemblies
    for data in assemblies:

        # make a deepcopy so the input Assemblies are left untouched
        ndata = copy.deepcopy(data)
        for sname, sample in ndata.samples.items():

            # rename sample if in rename dict
            if sname in rename_dict:
                sname = rename_dict[sname]
                sample.name = sname

            # is it in the merged assembly already
            if sname in merged.samples:
                msample = merged.samples[sname]

                # update stats
                msample.stats.reads_raw += sample.stats.reads_raw
                if sample.stats.reads_passed_filter:
                    msample.stats.reads_passed_filter += (
                        sample.stats.reads_passed_filter)

                # append files
                msample.files.fastqs.append(sample.files.fastqs)
                msample.files.edits.append(sample.files.edits)

                # do not allow state >2 at merging (requires reclustering)
                # if merging WITHIN samples.
                msample.stats.state = min(sample.stats.state, 2)

            # merge its stats and files
            else:
                merged.samples[sname] = sample

    # Merged assembly inherits max of hackers values (max frag length)
    merged.hackersonly.max_fragment_length = max(
        [i.hackersonly.max_fragment_length for i in assemblies])

    # Set the values for some params that don't make sense inside mergers
    merged_names = ", ".join([i.name for i in assemblies])
    merged.params.raw_fastq_path = "Merged: " + merged_names
    merged.params.barcodes_path = "Merged: " + merged_names
    merged.params.sorted_fastq_path = "Merged: " + merged_names

    # return the new Assembly object
    merged.save()
    return merged