#!/usr/bin/env python
# coding: utf-8

# In[6]:


import ipyrad as ip
import ipyparallel as ipp


# In[7]:


ipyclient = ipp.Client()


# In[3]:


data = ip.Assembly("1-pairtest")
data.set_params("raw_fastq_path", "ipsimdata/pairddrad_example_R*_.fastq.gz")
data.set_params("barcodes_path", "ipsimdata/pairddrad_example_barcodes.txt")
data.set_params("datatype", "pairddrad")
data.set_params("restriction_overhang", ("TGCAG", "CGG"))


# In[3]:


data = ip.Assembly("2-setest")
data.set_params("raw_fastq_path", "ipsimdata/rad_example_R1_.fastq.gz")
data.set_params("barcodes_path", "ipsimdata/rad_example_barcodes.txt")
data.set_params("datatype", "rad")
data.set_params("restriction_overhang", ("TGCAG", ""))


# In[3]:


data = ip.Assembly("3-refsetest")
data.set_params("raw_fastq_path", "ipsimdata/rad_example_R1_.fastq.gz")
data.set_params("barcodes_path", "ipsimdata/rad_example_barcodes.txt")
data.set_params("datatype", "rad")
data.set_params("assembly_method", "reference")
data.set_params("reference_sequence", "ipsimdata/rad_example_genome.fa")
data.set_params("restriction_overhang", ("TGCAG", ""))
#data.get_params()


# In[5]:


data = ip.Assembly("4-refpairtest")
data.set_params("raw_fastq_path", "ipsimdata/pairddrad_example_R*_.fastq.gz")
data.set_params("barcodes_path", "ipsimdata/rad_example_barcodes.txt")
data.set_params("datatype", "pairddrad")
data.set_params("assembly_method", "reference")
data.set_params("reference_sequence", "ipsimdata/pairddrad_example_genome.fa")
data.set_params("restriction_overhang", ("TGCAG", "CGG"))
#data.get_params()


# In[8]:


data = ip.Assembly("5-tortas")
data.set_params("project_dir", "tortas")
data.set_params("sorted_fastq_path", "/home/deren/Dropbox/Maud/fastq-concats/*.gz")
data.set_params("datatype", "pairddrad")
data.set_params("assembly_method", "reference")
data.set_params("reference_sequence", "/home/deren/Dropbox/Maud/lgeorge.genome.fa")
data.set_params("restriction_overhang", ("CATG", "AATT"))
data.set_params("filter_adapters", 2)


# In[4]:


data.run("12", force=True)


# In[11]:


#data.run("12", force=True)
#data = ip.load_json("1-pairtest.json")
#data = ip.load_json("2-setest.json")
#data = ip.load_json("3-refsetest.json")
#data = ip.load_json("4-refpairtest.json")
data = ip.load_json("tortas/5-tortas.json")


# In[12]:


# In[9]:


from ipyrad.assemble.cluster_across import *


# In[10]:


from ipyrad.assemble.clustmap import *
s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient)
samples = list(s3.data.samples.values())
sample = samples[1]


# In[7]:


s3.data.ipcluster["threads"] = 4
s3.run()


# In[11]:


s3.data.run("45")


# In[16]:


#s3.data.stats
data = s3.data
jobid = 0
samples = list(data.samples.values())[:4]
randomseed = 123


# In[ ]:


# In[19]:


conshandles = [
    sample.files.consens[0] for sample in samples if 
    sample.stats.reads_consens]
conshandles.sort()
assert conshandles, "no consensus files found"
conshandles


# In[20]:


## concatenate all of the gzipped consens files
cmd = ['cat'] + conshandles
groupcons = os.path.join(
    data.dirs.across, 
    "{}-{}-catcons.gz".format(data.name, jobid))
LOGGER.debug(" ".join(cmd))
with open(groupcons, 'w') as output:
    call = sps.Popen(cmd, stdout=output, close_fds=True)
    call.communicate()

## a string of sed substitutions for temporarily replacing hetero sites
## skips lines with '>', so it doesn't affect taxon names
subs = ["/>/!s/W/A/g", "/>/!s/w/A/g", "/>/!s/R/A/g", "/>/!s/r/A/g",
        "/>/!s/M/A/g", "/>/!s/m/A/g", "/>/!s/K/T/g", "/>/!s/k/T/g",
        "/>/!s/S/C/g", "/>/!s/s/C/g", "/>/!s/Y/C/g", "/>/!s/y/C/g"]
subs = ";".join(subs)


# In[27]:


## pipe passed data from gunzip to sed.
cmd1 = ["gunzip", "-c", groupcons]
cmd2 = ["sed", subs]
LOGGER.debug(" ".join(cmd1))
LOGGER.debug(" ".join(cmd2))

proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)
allhaps = groupcons.replace("-catcons.gz", "-cathaps.gz")
with open(allhaps, 'w') as output:
    proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=output, close_fds=True)
    proc2.communicate()
proc1.stdout.close()


# In[30]:


data.dirs.across = os.path.join(data.name + "_across")
if not os.path.exists(data.dirs.across):
    os.makedirs(data.dirs.across)
    
import ipyrad


# In[31]:


conshandles = [
    sample.files.consens[0] for sample in samples if 
    sample.stats.reads_consens]
conshandles.sort()
assert conshandles, "no consensus files found"

## concatenate all of the gzipped consens files
cmd = ['cat'] + conshandles
groupcons = os.path.join(
    data.dirs.across, 
    "{}-{}-catcons.gz".format(data.name, jobid))
LOGGER.debug(" ".join(cmd))
with open(groupcons, 'w') as output:
    call = sps.Popen(cmd, stdout=output, close_fds=True)
    call.communicate()

## a string of sed substitutions for temporarily replacing hetero sites
## skips lines with '>', so it doesn't affect taxon names
subs = ["/>/!s/W/A/g", "/>/!s/w/A/g", "/>/!s/R/A/g", "/>/!s/r/A/g",
        "/>/!s/M/A/g", "/>/!s/m/A/g", "/>/!s/K/T/g", "/>/!s/k/T/g",
        "/>/!s/S/C/g", "/>/!s/s/C/g", "/>/!s/Y/C/g", "/>/!s/y/C/g"]
subs = ";".join(subs)

## impute pseudo-haplo information to avoid mismatch at hetero sites
## the read data with hetero sites is put back into clustered data later.
## pipe passed data from gunzip to sed.
cmd1 = ["gunzip", "-c", groupcons]
cmd2 = ["sed", subs]
LOGGER.debug(" ".join(cmd1))
LOGGER.debug(" ".join(cmd2))

proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)
allhaps = groupcons.replace("-catcons.gz", "-cathaps.gz")
with open(allhaps, 'w') as output:
    proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=output, close_fds=True)
    proc2.communicate()
proc1.stdout.close()

## now sort the file using vsearch
allsort = groupcons.replace("-catcons.gz", "-catsort.fa")
cmd1 = [ipyrad.bins.vsearch,
        "--sortbylength", allhaps,
        "--fasta_width", "0",
        "--output", allsort]
LOGGER.debug(" ".join(cmd1))
proc1 = sps.Popen(cmd1, close_fds=True)
proc1.communicate()


# In[34]:


from ipyrad.assemble.cluster_across import *

random.seed(randomseed)

## open an iterator to lengthsorted file and grab two lines at at time
allshuf = groupcons.replace("-catcons.gz", "-catshuf.fa")
outdat = open(allshuf, 'wt')
indat = open(allsort, 'r')
idat = izip(iter(indat), iter(indat))
done = 0

chunk = [next(idat)]
while not done:
    ## grab 2-lines until they become shorter (unless there's only one)
    oldlen = len(chunk[-1][-1])
    while 1:
        try:
            dat = next(idat)
        except StopIteration:
            done = 1
            break
        if len(dat[-1]) == oldlen:
            chunk.append(dat)
        else:
            ## send the last chunk off to be processed
            random.shuffle(chunk)
            outdat.write("".join(chain(*chunk)))
            ## start new chunk
            chunk = [dat]
            break

## do the last chunk
random.shuffle(chunk)
outdat.write("".join(chain(*chunk)))

indat.close()
outdat.close()


# In[50]:


def build_clusters_from_cigars(data, sample):
    
    # get all regions with reads. Generator to yield (str, int, int)
    fullregions = bedtools_merge(data, sample).strip().split("\n")    
    regions = (i.split("\t") for i in fullregions)
    regions = ((i, int(j), int(k)) for (i, j, k) in regions)

    # access reads from bam file using pysam
    bamfile = AlignmentFile(
        os.path.join(data.dirs.refmapping, 
        "{}-mapped-sorted.bam".format(sample.name)),
        'rb')

    # iterate over all regions
    opath = os.path.join(
        data.dirs.clusts, "{}.clustS.gz".format(sample.name))
    out = gzip.open(opath, 'wt')
    idx = 0
    clusters = []
    for reg in regions:
        # uncomment and compare against ref sequence when testing
        #ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
        reads = bamfile.fetch(*reg)

        # store reads in a dict
        rdict = {}

        # paired-end data cluster building
        if "pair" in data.paramsdict["datatype"]:
            
            # match paired reads together in a dictionary            
            for read in reads:
                if read.qname not in rdict:
                    rdict[read.qname] = [read, None]
                else:
                    rdict[read.qname][1] = read

            # sort keys by derep number
            keys = sorted(
                rdict.keys(),
                key=lambda x: int(x.split("=")[-1]), reverse=True)

            # build the cluster based on map positions, orientation, cigar
            clust = []
            for key in keys:
                r1, r2 = rdict[key]
                if r1 and r2:

                    #lref = len(ref[1])
                    lref = reg[2] - reg[1]
                    arr1 = np.zeros(lref, dtype="U1")
                    arr2 = np.zeros(lref, dtype="U1")
                    arr1.fill("-")
                    arr2.fill("-")

                    # how far ahead of the start does this read begin
                    seq = cigared(r1.seq, r1.cigar)
                    start = r1.reference_start - reg[1] 
                    arr1[start:start + len(seq)] = list(seq)
                    
                    seq = cigared(r2.seq, r2.cigar)
                    start = r2.reference_start - reg[1] 
                    arr2[start:start + len(seq)] = list(seq)
                    
                    arr3 = join_arrays(arr1, arr2)
                    pairseq = "".join(arr3)

                    ori = "+"
                    if r1.is_reverse:
                        ori = "-"
                    derep = r1.qname.split("=")[-1]
                    rname = "{}:{}-{};size={};{}".format(*reg, derep, ori)
                    clust.append("{}\n{}".format(rname, pairseq))

        # single-end data cluster building
        else:   
            for read in reads:
                rdict[read.qname] = read

            # sort keys by derep number
            keys = sorted(
                rdict.keys(),
                key=lambda x: int(x.split("=")[-1]), reverse=True)

            # build the cluster based on map positions, orientation, cigar
            clust = []
            for key in keys:
                r1 = rdict[key]

                #aref = np.array(list(ref[1]))
                lref = reg[2] - reg[1]
                arr1 = np.zeros(lref, dtype="U1")
                arr1.fill("-")

                # how far ahead of the start does this read begin
                seq = cigared(r1.seq, r1.cigar)
                start = r1.reference_start - reg[1] 
                arr1[start:start + len(seq)] = list(seq)
                aseq = "".join(arr1)

                ori = "+"
                if r1.is_reverse:
                    ori = "-"
                derep = r1.qname.split("=")[-1]
                rname = "{}:{}-{};size={};{}".format(*reg, derep, ori)
                clust.append("{}\n{}".format(rname, aseq))

        # store this cluster
        clusters.append("\n".join(clust))
        idx += 1

        # if 1000 clusters stored then write to disk
        if not idx % 10:
            out.write("\n//\n//\n".join(clusters) + "\n//\n//\n")
            clusters = []
 
    # write final remaining clusters to disk
    out.write("\n//\n//\n".join(clusters) + "\n//\n//\n")
    out.close()


# In[51]:


build_clusters_from_cigars(data, sample)


# In[1]:


#maxlens, depths = get_quick_depths(data, sample)


# In[2]:


#sample_cleanup(data, sample)


# In[18]:


def get_quick_depths(data, sample):
    """ iterate over clustS files to get data """

    ## use existing sample cluster path if it exists, since this
    ## func can be used in step 4 and that can occur after merging
    ## assemblies after step3, and if we then referenced by data.dirs.clusts
    ## the path would be broken.
    if sample.files.clusters:
        pass
    else:
        ## set cluster file handles
        sample.files.clusters = os.path.join(
            data.dirs.clusts, sample.name + ".clustS.gz")

    ## get new clustered loci
    fclust = data.samples[sample.name].files.clusters
    clusters = gzip.open(fclust, 'rt')
    pairdealer = izip(*[iter(clusters)] * 2)

    ## storage
    depths = []
    maxlen = []

    ## start with cluster 0
    tdepth = 0
    tlen = 0

    ## iterate until empty
    while 1:
        ## grab next
        try:
            name, seq = next(pairdealer)
        except StopIteration:
            break

        ## if not the end of a cluster
        #print name.strip(), seq.strip()
        if name.strip() == seq.strip():
            depths.append(tdepth)
            maxlen.append(tlen)
            tlen = 0
            tdepth = 0

        else:
            try:
                tdepth += int(name.strip().split("=")[-1][:-2])
                tlen = len(seq)
            except:
                print(name)

    ## return
    clusters.close()
    return np.array(maxlen), np.array(depths)


# In[14]:


def sample_cleanup(data, sample):
    """ stats, cleanup, and link to samples """

    ## get maxlen and depths array from clusters
    maxlens, depths = get_quick_depths(data, sample)

    ## Test if depths is non-empty, but just full of zeros.
    if not depths.max():
        print("    no clusters found for {}".format(sample.name))
        return 

    else:
        ## store which min was used to calculate hidepth here
        sample.stats_dfs.s3["hidepth_min"] = (
            data.paramsdict["mindepth_majrule"])

        # If our longest sequence is longer than the current max_fragment_len
        # then update max_fragment_length. For assurance we require that
        # max len is 4 greater than maxlen, to allow for pair separators.
        hidepths = depths >= data.paramsdict["mindepth_majrule"]
        maxlens = maxlens[hidepths]

        ## Handle the case where there are no hidepth clusters
        if maxlens.any():
            maxlen = int(maxlens.mean() + (2. * maxlens.std()))
        else:
            maxlen = 0
        if maxlen > data._hackersonly["max_fragment_length"]:
            data._hackersonly["max_fragment_length"] = maxlen + 4

        ## make sense of stats
        keepmj = depths[depths >= data.paramsdict["mindepth_majrule"]]
        keepstat = depths[depths >= data.paramsdict["mindepth_statistical"]]

        ## sample summary stat assignments
        sample.stats["state"] = 3
        sample.stats["clusters_total"] = depths.shape[0]
        sample.stats["clusters_hidepth"] = keepmj.shape[0]

        ## store depths histogram as a dict. Limit to first 25 bins
        bars, bins = np.histogram(depths, bins=range(1, 26))
        sample.depths = {int(i): int(v) for i, v in zip(bins, bars) if v}

        ## sample stat assignments
        ## Trap numpy warnings ("mean of empty slice") printed by samples
        ## with few reads.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            sample.stats_dfs.s3["merged_pairs"] = sample.stats.reads_merged
            sample.stats_dfs.s3["clusters_total"] = depths.shape[0]
            try:
                sample.stats_dfs.s3["clusters_hidepth"] = (
                    int(sample.stats["clusters_hidepth"]))
            except ValueError:
                ## Handle clusters_hidepth == NaN
                sample.stats_dfs.s3["clusters_hidepth"] = 0
            sample.stats_dfs.s3["avg_depth_total"] = depths.mean()
            sample.stats_dfs.s3["avg_depth_mj"] = keepmj.mean()
            sample.stats_dfs.s3["avg_depth_stat"] = keepstat.mean()
            sample.stats_dfs.s3["sd_depth_total"] = depths.std()
            sample.stats_dfs.s3["sd_depth_mj"] = keepmj.std()
            sample.stats_dfs.s3["sd_depth_stat"] = keepstat.std()

    ## Get some stats from the bam files
    ## This is moderately hackish. samtools flagstat returns
    ## the number of reads in the bam file as the first element
    ## of the first line, this call makes this assumption.
    if not data.paramsdict["assembly_method"] == "denovo":
        ## shorter names
        mapf = os.path.join(
            data.dirs.refmapping, sample.name + "-mapped-sorted.bam")
        umapf = os.path.join(
            data.dirs.refmapping, sample.name + "-unmapped.bam")

        ## get from unmapped
        cmd1 = [ip.bins.samtools, "flagstat", umapf]
        proc1 = sps.Popen(cmd1, stderr=sps.STDOUT, stdout=sps.PIPE)
        result1 = proc1.communicate()[0]

        ## get from mapped
        cmd2 = [ip.bins.samtools, "flagstat", mapf]
        proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE)
        result2 = proc2.communicate()[0]

        ## store results
        ## If PE, samtools reports the _actual_ number of reads mapped, both 
        ## R1 and R2, so here if PE divide the results by 2 to stay consistent
        ## with how we've been reporting R1 and R2 as one "read pair"
        if "pair" in data.paramsdict["datatype"]:
            sample.stats["refseq_unmapped_reads"] = int(result1.split()[0]) / 2
            sample.stats["refseq_mapped_reads"] = int(result2.split()[0]) / 2
        else:
            sample.stats["refseq_unmapped_reads"] = int(result1.split()[0])
            sample.stats["refseq_mapped_reads"] = int(result2.split()[0])

        unmapped = os.path.join(data.dirs.refmapping, sample.name + "-unmapped.bam")
        samplesam = os.path.join(data.dirs.refmapping, sample.name + ".sam")
        for rfile in [unmapped, samplesam]:
            if os.path.exists(rfile):
                os.remove(rfile)

    # if loglevel==DEBUG
    log_level = ip.logger.getEffectiveLevel()
    if not log_level == 10:
        ## Clean up loose files only if not in DEBUG
        ##- edits/*derep, utemp, *utemp.sort, *htemp, *clust.gz
        derepfile = os.path.join(data.dirs.edits, sample.name + "_derep.fastq")
        mergefile = os.path.join(data.dirs.edits, sample.name + "_merged_.fastq")
        uhandle = os.path.join(data.dirs.clusts, sample.name + ".utemp")
        usort = os.path.join(data.dirs.clusts, sample.name + ".utemp.sort")
        hhandle = os.path.join(data.dirs.clusts, sample.name + ".htemp")
        clusters = os.path.join(data.dirs.clusts, sample.name + ".clust.gz")

        for rfile in [derepfile, mergefile, uhandle, usort, hhandle, clusters]:
            if os.path.exists(rfile):
                os.remove(rfile)


# In[7]:


# optimize speed of this next
build_clusters_from_cigars(data, sample)


# ### Consensus for refmapped data -- store all (even long pairs)?

# In[ ]:


### Write each out to a sam file...
# newconsensus()
# 
data._este = data.stats.error_est.mean()
data._esth = data.stats.hetero_est.mean()

clusters = open(os.path.join(data.dirs.clusts, "{}.clustS.gz".format(sample.name)), 'r')
clusters.read()

# plan to fill an h5 for this sample
tmp5 = consenshandle.replace("_tmpcons.", "_tmpcats.")
with h5py.File(tmp5, 'w') as io5:
    io5.create_dataset("cats", (optim, maxlen, 4), dtype=np.uint32)
    io5.create_dataset("alls", (optim, ), dtype=np.uint8)
    io5.create_dataset("chroms", (optim, 3), dtype=np.int64)

    ## local copies to use to fill the arrays
    catarr = io5["cats"][:]
    nallel = io5["alls"][:]
    refarr = io5["chroms"][:]


# In[ ]:


### Step 6 for refmapped pairs: 
# 1. convert all sams to bams and make a merged mapped-sorted.bam
# 2. get overlapping regions with bedtools_merge()
# 3. pull consens reads in aligned regions with bamfile.fetch()
# 4. store the consensus sequence in h5.
# 5. store the variants in h5.
# 6. store the depth of variants in h5.
# 7. 


# In[7]:


# get all regions with reads. Generator to yield (str, int, int)
fullregions = bedtools_merge(data, sample).strip().split("\n")    
regions = (i.split("\t") for i in fullregions)
regions = ((i, int(j), int(k)) for (i, j, k) in regions)

# access reads from bam file using pysam
samfile = AlignmentFile(
    os.path.join(data.dirs.refmapping, 
    "{}-mapped-sorted.bam".format(sample.name)),
    'rb')


# In[34]:


reg = next(regions)
ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
reads = samfile.fetch(*reg)


# In[35]:


# match paired reads together in a dictionary
rdict = {}
for read in reads:
    rdict[read.qname] = read

# sort keys by derep number
keys = sorted(
    rdict.keys(),
    key=lambda x: int(x.split("=")[-1]), reverse=True)

# build the cluster based on map positions, orientation, cigar
clust = []
for key in keys:
    r1 = rdict[key]

    aref = np.array(list(ref[1]))
    arr1 = np.zeros(aref.size, dtype="U1")
    arr1.fill("-")

    # how far ahead of the start does this read begin
    seq = cigared(r1.seq, r1.cigar)
    start = r1.reference_start - reg[1] 
    arr1[start:start + len(seq)] = list(seq)
    aseq = "".join(arr1)

    ori = "+"
    if r1.is_reverse:
        ori = "-"
    derep = r1.qname.split("=")[-1]
    rname = "{}:{}-{};size={};{}".format(*reg, derep, ori)
    clust.append("{}\n{}".format(rname, aseq))


# In[36]:


clust


# In[30]:


rdict


# In[16]:


mapping_reads(data, sample, 2)


# In[ ]:


# In[9]:


s3.data.run("5")


# In[11]:


subsamples = list(s3.data.samples.values())
subsamples.sort(key=lambda x: x.stats.clusters_hidepth, reverse=True)
jobs = {}


# In[8]:


from ipyrad.assemble.jointestimate import *
optim(data, sample)


# In[9]:


sample, _, _, nhidepth, maxlen = recal_hidepth(data, sample)


# In[6]:


data
concat_multiple_edits(data, sample)
merge_pairs_with_vsearch(data, sample, True)
merge_end_to_end(data, sample, True, True)


# In[7]:


dereplicate(data, sample, 2)


# In[8]:


cluster(data, sample, 2, 1)


# In[9]:


build_clusters(data, sample, 5)


# In[10]:


muscle_chunker(data, sample)


# In[14]:


for idx in range(10):
    handle = os.path.join(s3.data.tmpdir, 
        "{}_chunk_{}.ali".format(sample.name, idx))
    align_and_parse(handle, 5, 0)


# In[15]:


reconcat(data, sample)


# In[ ]:


# In[6]:


s3.run()


# In[6]:


# In[9]:


maxlens, depths = get_quick_depths(data, sample)
maxlens, depths


# In[ ]:


# In[7]:


data = s3.data
samples = list(s3.data.samples.values())
sample = samples[1]


# In[8]:


sample = samples[1]


# In[9]:


concat_multiple_edits(data, sample)


# In[10]:


merge_end_to_end(data, sample, False)


# In[11]:


dereplicate(data, sample, 2)


# In[12]:


split_endtoend_reads(data, sample)


# In[13]:


mapping_reads(data, sample, 2)


# In[14]:


build_ref_cigars(data, sample)


# In[ ]:


# In[ ]:


#samtools mpileup -uf ref.fa aln1.bam aln2.bam | bcftools view -bvcg - > var.raw.bcf
get_ipython().system(' samtools mpileup -ur /home/deren/Dropbox/opbox/Maud/lgeorge.genome.fa')


# In[9]:


#regions = bedtools_merge(data, sample).strip().split("\n")
fullregions = bedtools_merge(data, sample).strip().split("\n")


# In[10]:


regions = (i.split("\t") for i in fullregions)
regions = ((i, int(j), int(k)) for (i, j, k) in regions)


# In[ ]:


# In[14]:


def get_ref_region(reference, contig, rstart, rend):
    "returns the reference sequence over a given region"
    cmd = [
        ip.bins.samtools, 'faidx', 
        reference,
        "{}:{}-{}".format(contig, rstart + 1, rend),
        ]
    stdout, err = sps.Popen(cmd, stdout=sps.PIPE).communicate()
    name, seq = stdout.decode().split("\n", 1)
    listseq = [name, seq.replace("\n", "")]
    return listseq


# In[527]:


def build_ref_cigars(data, sample):
    
    # get all regions with reads. Generator to yield (str, int, int)
    #fullregions = bedtools_merge(data, sample).strip().split("\n")    
    regions = (i.split("\t") for i in fullregions)
    regions = ((i, int(j), int(k)) for (i, j, k) in regions)

    # access reads from bam file using pysam
    samfile = AlignmentFile(
        os.path.join(data.dirs.refmapping, 
        "{}-mapped-sorted.bam".format(sample.name)),
        'rb')

    # iterate over all regions
    out = open("test.clustS", 'w')
    idx = 0
    clusters = []
    for reg in regions:
        ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
        reads = samfile.fetch(*reg)

        # match paired reads together in a dictionary
        rdict = {}
        for read in reads:
            if read.qname not in rdict:
                rdict[read.qname] = [read, None]
            else:
                rdict[read.qname][1] = read

        # sort keys by derep number
        keys = sorted(
            rdict.keys(),
            key=lambda x: int(x.split("=")[-1]), reverse=True)

        # build the cluster based on map positions, orientation, cigar
        clust = []
        for key in keys:
            r1, r2 = rdict[key]
            if r1 and r2:

                aref = np.array(list(ref[1]))
                arr1 = np.zeros(aref.size, dtype="U1")
                arr2 = np.zeros(aref.size, dtype="U1")
                arr1.fill("-")
                arr2.fill("-")

                try:
                    # how far ahead of the start does this read begin
                    seq = cigared(r1.seq, r1.cigar)
                    start = r1.reference_start - reg[1] 
                    arr1[start:start + len(seq)] = list(seq)

                    seq = cigared(r2.seq, r2.cigar)
                    start = r2.reference_start - reg[1] 
                    arr2[start:start + len(seq)] = list(seq)

                    arr3 = join_arrays(arr1, arr2)
                    pairseq = "".join(arr3)
                    derep = r1.qname.split("=")[-1]
                    clust.append("{}\n{}".format("{}:{}-{};size={}"
                                                 .format(*reg, derep), pairseq))
                except ValueError:
                    print(reg)
        clusters.append("\n".join(clust))
        idx += 1
        if not idx % 1000:
            out.write("\n//\n//\n".join(clusters))
    out.close()


# In[549]:


def build_ref_cigars(data, sample):
    
    # get all regions with reads. Generator to yield (str, int, int)
    #fullregions = bedtools_merge(data, sample).strip().split("\n")    
    regions = (i.split("\t") for i in fullregions)
    regions = ((i, int(j), int(k)) for (i, j, k) in regions)

    # access reads from bam file using pysam
    samfile = AlignmentFile(
        os.path.join(data.dirs.refmapping, 
        "{}-mapped-sorted.bam".format(sample.name)),
        'rb')

    # iterate over all regions
    opath = os.path.join(
        data.dirs.refmapping, "{}.clustS.gz".format(sample.name))
    out = gzip.open(opath, 'w')
    idx = 0
    clusters = []
    for reg in regions:
        ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
        reads = samfile.fetch(*reg)

        # match paired reads together in a dictionary
        rdict = {}
        for read in reads:
            if read.qname not in rdict:
                rdict[read.qname] = [read, None]
            else:
                rdict[read.qname][1] = read

        # sort keys by derep number
        keys = sorted(
            rdict.keys(),
            key=lambda x: int(x.split("=")[-1]), reverse=True)

        # build the cluster based on map positions, orientation, cigar
        clust = []
        for key in keys:
            r1, r2 = rdict[key]
            if r1 and r2:

                aref = np.array(list(ref[1]))
                arr1 = np.zeros(aref.size, dtype="U1")
                arr2 = np.zeros(aref.size, dtype="U1")
                arr1.fill("-")
                arr2.fill("-")

                # how far ahead of the start does this read begin
                seq = cigared(r1.seq, r1.cigar)
                start = r1.reference_start - reg[1] 
                arr1[start:start + len(seq)] = list(seq)
                
                seq = cigared(r2.seq, r2.cigar)
                start = r2.reference_start - reg[1] 
                arr2[start:start + len(seq)] = list(seq)
                
                arr3 = join_arrays(arr1, arr2)
                pairseq = "".join(arr3)

                derep = r1.qname.split("=")[-1]
                rname = "{}:{}-{};size={}".format(*reg, derep)
                clust.append("{}\n{}".format(rname, pairseq))
        clusters.append("\n".join(clust))
        idx += 1
        if not idx % 100:
            out.write("\n//\n//\n".join(clusters).encode())
    out.close()


# In[550]:


c = build_ref_cigars(data, sample)


# In[547]:


"\n//\n//\n".join(c).encode()


# In[397]:


regions = (i.split("\t") for i in fullregions)
regions = ((i, int(j), int(k)) for (i, j, k) in regions)
build_ref_cigars(data, sample)


# In[438]:


reg = ('Contig0', 41920, 42479)
reg = ('Contig0', 7219468, 7220326)
reg = ('Contig0', 207998, 208219)
reg = ('Contig0', 16738, 16933)
reg = ('Contig0', 49781, 50005)
reg = ('Contig0', 76480, 76711)
reg = ('Contig0', 24391, 24908)
reg = ('Contig0', 7193, 7952)       # has merge
#reg = ('Contig0', 13327, 13845)     # has indels
reg = ('Contig0', 76480, 77015)  # has indels

reg = ('Contig0', 346131, 346193)  # valueerror

samfile = AlignmentFile(
    os.path.join(data.dirs.refmapping, 
    "{}-mapped-sorted.bam".format(sample.name)),
    'rb')
ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
reads = list(samfile.fetch(*reg))


# In[519]:


def cigared(sequence, cigartups):
    start = 0
    seq = ""
    for tup in cigartups:
        flag, add = tup
        if flag is 0:
            seq += sequence[start:start + add]
        if flag is 1:
            pass
        if flag is 2:
            seq += "-" * add
            start -= add
        if flag is 4:
            pass
        start += add
    return seq    


# In[456]:


# match paired reads together in a dictionary
rdict = {}
for read in reads:
    if read.qname not in rdict:
        rdict[read.qname] = [read]
    else:
        rdict[read.qname].append(read)

# sort keys by derep number
keys = sorted(
    rdict.keys(),
    key=lambda x: int(x.split("=")[-1]), reverse=True)

# build the cluster based on map positions, orientation, cigar
for key in keys:
    r1, r2 = rdict[key]
    if r1 and r2:
        print(r1.seq, r1.cigar)# r1.pos, r1.aend, r1.rlen, r1.aligned_pairs)
        print(cigared(r1.seq, r1.cigar))


# In[310]:


out = open("test2.txt", 'w')
# build the cluster based on map positions, orientation, cigar
for key in keys:
    r1, r2 = rdict[key]
    if r1 and r2:
        
        # empty arrays
        aref = np.array(list(ref[1]))
        arr1 = np.zeros(aref.size, dtype="U1")
        arr2 = np.zeros(aref.size, dtype="U1")
        arr1.fill("-")
        arr2.fill("-")

        # how far ahead of the start does this read begin
        start = r1.reference_start - reg[1] 
        seq = cigared(r1.seq, r1.cigar)
        arr1[start:start+len(seq)] = list(seq)
        
        seq = cigared(r2.seq, r2.cigar)
        start = r2.reference_start - reg[1] 
        arr2[start:start+len(seq)] = list(seq)
        #print("".join(arr1), file=out)
        #print("".join(arr2), file=out)
        
        arr3 = join_arrays(arr1, arr2)
        print("".join(arr3), file=out)
        
out.close()


# In[307]:


import numba

#numba.jit(nopython=True)
def join_arrays(arr1, arr2):
    arr3 = np.zeros(arr1.size, dtype="U1")
    for i in range(arr1.size):
        
        if arr1[i] == arr2[i]:
            arr3[i] = arr1[i]
        
        elif arr1[i] == "N":
            if arr2[i] == "-":
                arr3[i] = "N"
            else:
                arr3[i] = arr2[i]
                
        elif arr2[i] == "N":
            if arr1[i] == "-":
                arr3[i] = "N"
            else:
                arr3[i] = arr1[i]
        
        elif arr1[i] == "-":
            if arr2[i] == "N":
                arr3[i] = "N"
            else:
                arr3[i] = arr2[i]
        
        elif arr2[i] == "-":
            if arr1[i] == "N":
                arr3[i] = "N"
            else:
                arr3[i] = arr1[i]
                
        else:
            arr3[i] = "N"
    return arr3
    

a1 = np.array(list("AGAGAG-NN----"))
a2 = np.array(list("----ACTTNNTTT"))
de = np.array(list("AGAGANTTNNTTT"))   

a1 = np.array(list("AGAGAG-NN-------"))
a2 = np.array(list("----ACTTNNTTT---"))
de = np.array(list("AGAGANTTNNTTT---"))


# In[308]:


join_arrays(a1, a2)


# In[233]:


# how far ahead of the start does this read begin
start = r1.reference_start - reg[1] 


# In[228]:


aref = np.array(list(ref[1]))
arr1 = np.zeros(aref.size, dtype="U1")
arr2 = np.zeros(aref.size, dtype="U1")
arr1.fill("-")


# In[194]:


r1.reference_start, r1.pos, r1.reference_end, r1.cigar


# In[202]:


cigared(r1.seq, r1.cigar)


# In[199]:


# which is forward and reverse
if r1.blocks[0][0] > r2.blocks[0][0]:
    rx = r2
    r2 = r1
    r1 = rx

# get arrs
aref = np.array(list(ref[1]))
arr1 = np.zeros(aref.size, dtype="U1")
arr2 = np.zeros(aref.size, dtype="U1")

# fill arr1
seq1 = cigared(r1.seq, r1.cigar)
arr1[]


# In[197]:


ref = get_ref_region(data.paramsdict["reference"], *reg)

def cigar(r1, r2, reg):
    
    # which is forward and reverse
    if r1.blocks[0][0] > r2.blocks[0][0]:
        rx = r2
        r2 = r1
        r1 = rx
        
    # get arrs
    aref = np.array(list(ref[1]))
    arr1 = np.zeros(aref.size, dtype="U1")
    arr2 = np.zeros(aref.size, dtype="U1")
    
    # fill arr1
    
    
    # do they overlap?
    overlap = False
    if max(r1.blocks[0]) > min(r2.blocks[0]):
        overlap = True
        #osegment = r1.blocks[0][1] - r2.blocks[0][0]
        #oseqs = r1.seq[-osegment:], r2.seq[:osegment]
        #print(oseqs)
    
    # modify for cigar
    for edit in r1.cigar:
        r1.seq
        
    
    before = "-" * (r1.pos - reg[1])
    if r1.is_reverse:
        read = before + revcomp(r1.seq)
    else:
        read = before + r1.seq
        
    midns = "-" * (r2.pos - r1.aend)
    read += midns
    
    if r2.is_reverse:
        read += r2.seq
    else:
        read += revcomp(r2.seq)

    after = "-" * (reg[2] - r2.aend)
    read += after
    return read


# In[ ]:


# In[ ]:


# In[ ]:


# In[350]:


def get_ref_region(reference, contig, rstart, rend):
    cmd = [
        ip.bins.samtools, 'faidx', 
        reference, 
        "{}:{}-{}".format(contig, rstart+1, rend)]
    stdout, err = sps.Popen(cmd, stdout=sps.PIPE).communicate()
    name, seq = stdout.decode().split("\n", 1)
    listseq = [name, seq.replace("\n", "")]
    return listseq


# In[250]:


ireg = samfile.fetch(*reg)
rdict = {}fo
for read in ireg:
    if read.qname not in rdict:
        rdict[read.qname] = [read]
    else:
        rdict[read.qname].append(read)

clust = dict_to_clust(rdict)


# In[251]:


clust


# In[252]:


ref = get_ref_region(*reg)
ref


# In[255]:


print(ref[1][:80])
print(r1.seq)


# In[272]:


# how for ahead of reference is r1 start
ahead = -1 * (reg[1] - (r1.pos - 1))


# In[274]:


print(ref[1][:90])
print("-"*ahead, r1.seq)


# In[344]:


print('r', ref[1][:90])

for key in rdict:
    r1, r2 = rdict[key]
    
    ahead = -1 * (reg[1] - (r1.pos))
    print('h', ("-"*ahead + r1.seq)[:90])
    #print(r1.cigar)
    
print('r', ref[1][-90:])

for key in rdict:
    r1, r2 = rdict[key]
    
    ahead = (reg[2] - (r2.pos))
    
    print(reg[2], r2.pos, ahead)
    print('h', ("-"*ahead))
    #print(r1.cigar)

    #print(r2.seq)
    #print(r2.get_blocks())
    #print(r1.seq)
    #print(r1.get_blocks()[0])
    #print(r1.aend, r2.get_blocks()[0][0])
    #print(r1.cigar)
    print(r2.cigarstring)


# In[66]:


for read in clust:
    print(read)


# In[30]:


# access reads from bam file using pysam
samfile = AlignmentFile(
    os.path.join(data.dirs.refmapping, 
    "{}-mapped-sorted.bam".format(sample.name)),
    'rb')

# iterate over all mapped regions
clusters = []
for reg in regions:
    ireg = samfile.fetch(*reg)

    # match paired reads by read names for all reads in each cluster
    rdict = {}
    for read in ireg:
        if read.qname not in rdict:
            rdict[read.qname] = [read]
        else:
            rdict[read.qname].append(read)

    clust = dict_to_clust(rdict)


# In[31]:


rdict


# In[29]:


from ipyrad.assemble.util import revcomp


# In[106]:


def build_ref_clusters_from_cigars(data, sample):
    
    # get regions from bedtools overlaps
    regions = bedtools_merge(data, sample).strip().split("\n")
    regions = (i.split("\t") for i in regions)
    regions = ((i, int(j), int(k)) for (i, j, k) in regions)

    # access reads from bam file using pysam
    samfile = AlignmentFile(
        os.path.join(data.dirs.refmapping, 
        "{}-mapped-sorted.bam".format(sample.name)),
        'rb')
    
    # iterate over all mapped regions
    clusters = []
    for reg in regions:
        ireg = samfile.fetch(*reg)
    
        # match paired reads by read names for all reads in each cluster
        rdict = {}
        for read in ireg:
            if read.qname not in rdict:
                rdict[read.qname] = [read]
            else:
                rdict[read.qname].append(read)

        return dict_to_clust(rdict)


# In[109]:


seq = build_ref_clusters_from_cigars(data, sample)
seq


# In[27]:


def dict_to_clust(rdict):
    clust = []
    for pair in rdict.values():
        r1, r2 = pair
        poss = r1.get_reference_positions() + r2.get_reference_positions()
        seedstart, seedend = min(poss), max(poss)

        reads_overlap = False
        #print(r1.cigartuples, r1.cigarstring, r1.cigar)

        if r1.is_reverse:
            if r2.aend > r1.get_blocks()[0][0]:
                reads_overlap = True
                seq = r2.seq + 'nnnn' + revcomp(r1.seq)
            else:
                seq = r2.seq + 'nnnn' + r2.seq

        else:
            if r1.aend > r2.get_blocks()[0][0]:
                reads_overlap = True
                seq = r1.seq + 'nnnn' + revcomp(r2.seq)
            else:
                seq = r1.seq + 'nnnn' + r2.seq
        clust.append(seq)
    return clust


# In[8]:


# build clusters for aligning with muscle from the sorted bam file
samfile = AlignmentFile(
    os.path.join(data.dirs.refmapping, 
    "{}-mapped-sorted.bam".format(sample.name)),
    'rb')


# In[24]:


chrom, rstart, rend = regions[0].split()
reg = samfile.fetch(chrom, int(rstart), int(rend))

for read in reg:
    print(rstart)
    print(read.seq)


# In[ ]:


# In[9]:


nonmerged1 = os.path.join(
    data.tmpdir, 
    "{}_nonmerged_R1_.fastq".format(sample.name))
nonmerged2 = os.path.join(
    data.tmpdir, 
    "{}_nonmerged_R2_.fastq".format(sample.name))
edits1 = os.path.join(
    data.dirs.edits,
    "{}.trimmed_R1_.fq.gz".format(sample.name))
edits2 = os.path.join(
    data.dirs.edits, 
    "{}.trimmed_R2_.fq.gz".format(sample.name))

# file precedence
nonm1 = [i for i in (edits1, nonmerged1) if os.path.exists(i)][-1]
nonm2 = [i for i in (edits2, nonmerged2) if os.path.exists(i)][-1]


# In[10]:


edits1


# In[11]:


infiles = [
    os.path.join(data.dirs.edits, "{}.trimmed_R1_.fastq.gz".format(sample.name)),
    os.path.join(data.dirs.edits, "{}_R1_concatedit.fq.gz".format(sample.name)),
    os.path.join(data.tmpdir, "{}_merged.fastq".format(sample.name)),
    os.path.join(data.tmpdir, "{}_declone.fastq".format(sample.name)),
]
infiles = [i for i in infiles if os.path.exists(i)]
infile = infiles[-1]


# In[15]:


split_endtoend_reads(data, sample)


# In[11]:


with open(inp, 'r') as infile:
    duo = izip(*[iter(infile)] * 2)
    
    idx = 0
    while 1:
        try:
            itera = next(duo)
        except StopIteration:
            break
            
        r1, r2 = itera[1].split("nnnn")


# In[14]:


def split_endtoend_reads(data, sample):
    """
    Takes R1nnnnR2 derep reads from paired data and splits it back into 
    separate R1 and R2 parts for read mapping. 
    """

    inp = os.path.join(data.tmpdir, "{}_derep.fastq".format(sample.name))
    out1 = os.path.join(data.tmpdir, "{}_R1-tmp.fastq".format(sample.name))
    out2 = os.path.join(data.tmpdir, "{}_R2-tmp.fastq".format(sample.name))

    splitderep1 = open(out1, 'w')
    splitderep2 = open(out2, 'w')

    with open(inp, 'r') as infile: 
        # Read in the infile two lines at a time: (seqname, sequence)
        duo = izip(*[iter(infile)] * 2)

        ## lists for storing results until ready to write
        split1s = []
        split2s = []

        ## iterate over input splitting, saving, and writing.
        idx = 0
        while 1:
            try:
                itera = next(duo)
            except StopIteration:
                break
            ## split the duo into separate parts and inc counter
            part1, part2 = itera[1].split("nnnn")
            idx += 1

            ## R1 needs a newline, but R2 inherits it from the original file            
            ## store parts in lists until ready to write
            split1s.append("{}{}\n".format(itera[0], part1))
            split2s.append("{}{}".format(itera[0], part2))

            ## if large enough then write to file
            if not idx % 10000:
                splitderep1.write("".join(split1s))
                splitderep2.write("".join(split2s))
                split1s = []
                split2s = [] 

    ## write final chunk if there is any
    if any(split1s):
        splitderep1.write("".join(split1s))
        splitderep2.write("".join(split2s))

    ## close handles
    splitderep1.close()
    splitderep2.close()


# In[10]:


merge_pairs(data, sample, 1, 1)


# In[ ]:


# In[8]:


s3.data.samples["1A_0"].concatfiles


# In[6]:


s3.run()


# In[28]:


def bedtools_merge(data, sample):
    """ 
    Get all contiguous genomic regions with one or more overlapping
    reads. This is the shell command we'll eventually run

    bedtools bamtobed -i 1A_0.sorted.bam | bedtools merge [-d 100]
        -i <input_bam>  :   specifies the input file to bed'ize
        -d <int>        :   For PE set max distance between reads
    """
    LOGGER.info("Entering bedtools_merge: %s", sample.name)
    mappedreads = os.path.join(data.dirs.refmapping, 
                               sample.name + "-mapped-sorted.bam")

    ## command to call `bedtools bamtobed`, and pipe output to stdout
    ## Usage:   bedtools bamtobed [OPTIONS] -i <bam> 
    ## Usage:   bedtools merge [OPTIONS] -i <bam> 
    cmd1 = [ip.bins.bedtools, "bamtobed", "-i", mappedreads]
    cmd2 = [ip.bins.bedtools, "merge", "-i", "-"]

    ## If PE the -d flag to tell bedtools how far apart to allow mate pairs.
    ## If SE the -d flag is negative, specifying that SE reads need to
    ## overlap by at least a specific number of bp. This prevents the
    ## stairstep syndrome when a + and - read are both extending from
    ## the same cutsite. Passing a negative number to `merge -d` gets this done.
    if 'pair' in data.paramsdict["datatype"]:
        check_insert_size(data, sample)
        #cmd2.insert(2, str(data._hackersonly["max_inner_mate_distance"]))
        cmd2.insert(2, str(data._hackersonly["max_inner_mate_distance"]))
        cmd2.insert(2, "-d")
    else:
        cmd2.insert(2, str(-1 * data._hackersonly["min_SE_refmap_overlap"]))
        cmd2.insert(2, "-d")

    ## pipe output from bamtobed into merge
    LOGGER.info("stdv: bedtools merge cmds: %s %s", cmd1, cmd2)
    proc1 = sps.Popen(cmd1, stderr=sps.STDOUT, stdout=sps.PIPE)
    proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc1.stdout)
    result = proc2.communicate()[0].decode()
    proc1.stdout.close()

    ## check for errors and do cleanup
    if proc2.returncode:
        raise IPyradWarningExit("error in %s: %s", cmd2, result)

    ## Write the bedfile out, because it's useful sometimes.
    if os.path.exists(ip.__debugflag__):
        with open(os.path.join(data.dirs.refmapping, sample.name + ".bed"), 'w') as outfile:
            outfile.write(result)

    ## Report the number of regions we're returning
    nregions = len(result.strip().split("\n"))
    LOGGER.info("bedtools_merge: Got # regions: %s", nregions)
    return result


# In[27]:


bedtools_merge


# In[29]:


def check_insert_size(data, sample):
    """
    check mean insert size for this sample and update 
    hackersonly.max_inner_mate_distance if need be. This value controls how 
    far apart mate pairs can be to still be considered for bedtools merging 
    downstream.
    """

    ## pipe stats output to grep
    cmd1 = [
        ip.bins.samtools, 
        "stats", 
        os.path.join(
            data.dirs.refmapping, "{}-mapped-sorted.bam".format(sample.name)),
        ]
    cmd2 = ["grep", "SN"]
    proc1 = sps.Popen(cmd1, stderr=sps.STDOUT, stdout=sps.PIPE)
    proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc1.stdout)
    res = proc2.communicate()[0].decode()
    if proc2.returncode:
        raise IPyradWarningExit("error in %s: %s", cmd2, res)
        
    ## starting vals
    avg_insert = 0
    stdv_insert = 0
    avg_len = 0

    ## iterate over results
    for line in res.split("\n"):
        if "insert size average" in line:
            avg_insert = float(line.split(":")[-1].strip())

        elif "insert size standard deviation" in line:
            ## hack to fix sim data when stdv is 0.0. Shouldn't
            ## impact real data bcz stdv gets rounded up below
            stdv_insert = float(line.split(":")[-1].strip()) + 0.1
       
        elif "average length" in line:
            avg_len = float(line.split(":")[-1].strip())

    LOGGER.debug("avg {} stdv {} avg_len {}"
                 .format(avg_insert, stdv_insert, avg_len))

    ## If all values return successfully set the max inner mate distance.
    ## This is tricky. avg_insert is the average length of R1+R2+inner mate
    ## distance. avg_len is the average length of a read. If there are lots
    ## of reads that overlap then avg_insert will be close to but bigger than
    ## avg_len. We are looking for the right value for `bedtools merge -d`
    ## which wants to know the max distance between reads. 
    if all([avg_insert, stdv_insert, avg_len]):
        ## If 2 * the average length of a read is less than the average
        ## insert size then most reads DO NOT overlap
        if stdv_insert < 5:
            stdv_insert = 5.
        if (2 * avg_len) < avg_insert:
            hack = avg_insert + (3 * np.math.ceil(stdv_insert)) - (2 * avg_len)

        ## If it is > than the average insert size then most reads DO
        ## overlap, so we have to calculate inner mate distance a little 
        ## differently.
        else:
            hack = (avg_insert - avg_len) + (3 * np.math.ceil(stdv_insert))
            

        ## set the hackerdict value
        LOGGER.info("stdv: hacked insert size is %s", hack)
        data._hackersonly["max_inner_mate_distance"] = int(np.math.ceil(hack))

    else:
        ## If something fsck then set a relatively conservative distance
        data._hackersonly["max_inner_mate_distance"] = 300
        LOGGER.debug("inner mate distance for {} - {}".format(sample.name,\
                    data._hackersonly["max_inner_mate_distance"]))


# In[33]:


#from ipyrad.assemble.refmap import *
data = s3.data
samples = list(data.samples.values())
sample = samples[0]
regions = bedtools_merge(data, sample).strip().split("\n")


# In[34]:


nregions = len(regions)
chunksize = (nregions / 10) + (nregions % 10)


# In[43]:


from pysam import AlignmentFile
sample.files.mapped_reads = os.path.join(
    data.dirs.refmapping, sample.name + "-mapped-sorted.bam")
samfile = AlignmentFile(sample.files.mapped_reads, 'rb')
#"./tortas_refmapping/PZ70-mapped-sorted.bam", "rb")


# In[97]:


regions[1]


# In[170]:


samfile = pysam.AlignmentFile("ex1.sam", "r")


# In[262]:


a = pysam.AlignmentFile("./3-refsetest_refmapping/1A_0.sam")


# In[287]:


nthreads = 2
cmd1 = [
    ip.bins.bwa, "mem",
    "-t", str(max(1, nthreads)),
    "-M",
    data.paramsdict['reference_sequence'], 
    sample.concatfiles[0][0],
    sample.concatfiles[0][1] or "",
    ] 
cmd1 = [i for i in cmd1 if i]

# Insert optional flags for bwa
bwa_args = data._hackersonly["bwa_args"].split()
bwa_args.reverse()
for arg in bwa_args:
    cmd1.insert(2, arg)

with open(sample.files.sam, 'wb') as outfile:
    proc1 = sps.Popen(cmd1, stderr=None, stdout=outfile)
    error1 = proc1.communicate()[0]
    if proc1.returncode:
        raise IPyradError(error1)


# In[288]:


def bwa_map(data, sample, nthreads, force):
    """ 
    Map reads to reference sequence. This reads in the fasta files
    (samples.files.edits), and maps each read to the reference. Unmapped reads 
    are dropped right back in the de novo pipeline.  
    Mapped reads end up in a sam file.
    """

    sample.files.sam = os.path.join(
        data.dirs.refmapping, 
        "{}.sam".format(sample.name))

    sample.files.mapped_reads = os.path.join(
        data.dirs.refmapping,
        "{}-mapped-sorted.bam".format(sample.name))

    sample.files.unmapped_bam = os.path.join(
        data.dirs.refmapping,
        "{}-unmapped.bam".format(sample.name))

    sample.files.unmapped_reads = os.path.join(
        data.dirs.refmapping,
        "{}-unmapped.fastq".format(sample.name))


    # (cmd1) bwa mem [OPTIONS] <index_name> <file_name_A> [<file_name_B>] 
    #  -t #         : Number of threads
    #  -M           : Mark split alignments as secondary.

    # (cmd2) samtools view [options] <in.bam>|<in.sam>|<in.cram> [region ...] 
    #  -b = write to .bam
    #  -q = Only keep reads with mapq score >= 30 (seems to be pretty standard)
    #  -F = Select all reads that DON'T have these flags. 
    #        0x4 (segment unmapped)
    #        0x100 (Secondary alignment)
    #        0x800 (supplementary alignment)
    #  -U = Write out all reads that don't pass the -F filter 
    #       (all unmapped reads go to this file).

    # (cmd3) samtools sort [options...] [in.bam]
    #  -T = Temporary file name, this is required by samtools, ignore it
    #       Here we hack it to be samhandle.tmp cuz samtools cleans it up
    #  -O = Output file format, in this case bam
    #  -o = Output file name

    # (cmd5) samtools bam2fq -v 45 [in.bam]
    #   -v45 set the default qscore arbirtrarily high
    #
    cmd1 = [
        ip.bins.bwa, "mem",
        "-t", str(max(1, nthreads)),
        "-M",
        data.paramsdict['reference_sequence'], 
        sample.concatfiles[0][0],
        sample.concatfiles[0][1] or "",
        ] 
    cmd1 = [i for i in cmd1 if i]

    # Insert optional flags for bwa
    bwa_args = data._hackersonly["bwa_args"].split()
    bwa_args.reverse()
    for arg in bwa_args:
        cmd1.insert(2, arg)

    with open(sample.files.sam, 'wb') as outfile:
        proc1 = sps.Popen(cmd1, stderr=None, stdout=outfile)
        error1 = proc1.communicate()[0]
        if proc1.returncode:
            raise IPyradError(error1)

    # sends unmapped reads to a files and will PIPE mapped reads to cmd3
    cmd2 = [
        ip.bins.samtools, "view", 
        "-b", 
        "-F", "0x904",
        "-U", sample.files.unmapped_bam,
        sample.files.sam, 
        ]

    # this is gonna catch mapped bam output from cmd2 and write to file
    cmd3 = [
        ip.bins.samtools, "sort", 
        "-T", os.path.join(data.dirs.refmapping, sample.name + ".sam.tmp"),
        "-O", "bam", 
        "-o", sample.files.mapped_reads]

    # Later we're gonna use samtools to grab out regions using 'view'
    cmd4 = [ipyrad.bins.samtools, "index", sample.files.mapped_reads]

    # convert unmapped reads to fastq
    cmd5 = [
        ip.bins.samtools, "bam2fq",
        "-v 45",
        sample.files.unmapped_bam, 
        ]

    # Insert additional arguments for paired data to the commands.
    # We assume Illumina paired end reads for the orientation 
    # of mate pairs (orientation: ---> <----). 
    if 'pair' in data.paramsdict["datatype"]:
        # add samtools filter for only keep if both pairs hit
        # 0x1 - Read is paired
        # 0x2 - Each read properly aligned
        cmd2.insert(2, "0x3")
        cmd2.insert(2, "-f")

        # tell bam2fq that there are output files for each read pair
        cmd5.insert(2, os.path.join(
            data.dirs.edits, sample.name + "-tmp-umap1.fastq"))
        cmd5.insert(2, "-1")
        cmd5.insert(2, os.path.join(
            data.dirs.edits, sample.name + "-tmp-umap2.fastq"))
        cmd5.insert(2, "-2")
    else:
        cmd5.insert(2, sample.files.unmapped_reads)
        cmd5.insert(2, "-0")


    # cmd2 writes to sname.unmapped.bam and fills pipe with mapped BAM data
    LOGGER.debug(" ".join(cmd2))
    proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE)

    # cmd3 pulls mapped BAM from pipe and writes to sname.mapped-sorted.bam
    LOGGER.debug(" ".join(cmd3))
    proc3 = sps.Popen(cmd3, 
        stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc2.stdout)
    error3 = proc3.communicate()[0]
    if proc3.returncode:
        raise IPyradWarningExit(error3)
    proc2.stdout.close()

    # cmd4 indexes the bam file 
    LOGGER.debug(" ".join(cmd4))
    proc4 = sps.Popen(cmd4, stderr=sps.STDOUT, stdout=sps.PIPE)
    error4 = proc4.communicate()[0]
    if proc4.returncode:
        raise IPyradWarningExit(error4)
    
    # Running cmd5 writes to either edits/sname-refmap_derep.fastq for SE
    # or it makes edits/sname-tmp-umap{12}.fastq for paired data, which 
    # will then need to be merged.
    LOGGER.debug(" ".join(cmd5))
    proc5 = sps.Popen(cmd5, stderr=sps.STDOUT, stdout=sps.PIPE)
    error5 = proc5.communicate()[0]
    if proc5.returncode:
        raise IPyradWarningExit(error5)


# In[289]:


bwa_map(data, sample, 2, 1)


# In[ ]:


# In[205]:


cmd1 = [
    ip.bins.bwa, "mem",
    "-t", str(max(1, 2)),
    "-M",
    data.paramsdict['reference_sequence'], 
    sample.concatfiles[0][0],
    sample.concatfiles[0][1] or "",
    ] 

proc1 = sps.Popen(cmd1)#, stderr=cmd1_stderr, stdout=cmd1_stdout)
proc1.communicate()


# In[ ]:


pysam.sort("-m", "1000000", "-o", "output.bam", "ex1.bam")


# In[182]:


iterreg = samfile.fetch("MT", 5009, 5100)
iterreg = samfile.fetch("MT", 10109, 10200)
iterreg = samfile.fetch("MT", 285510, 285600)


# In[181]:


for read in iterreg:
    print(read.seq)
    print(read.compare(read.get_reference_sequence()))


# In[104]:


it = samfile.pileup("MT", 5009, 5100)
pysam.Pileup.


# In[102]:


for read in iterreg:
    print(read.aligned_pairs)


# In[87]:


rdict = {}
for read in iterreg:
    if read.qname not in rdict:
        rdict[read.qname] = read


# In[80]:


read.seq


# In[84]:


read.cigar


# In[77]:


sorted(
    rdict.keys(),
    key=lambda x: int(x.split(""))


# In[65]:


it = samfile.pileup('MT', 5009, 5100)


# In[71]:


clusts = []
nclusts = 0
for region in regions:
    chrom, pos1, pos2 = region.split()
    
    clust = fetch_cluster_se(data, samfile, chrom, int(pos1), int(pos2))


# In[ ]:


def fetch_cluster_se(data, samfile, chrom, rstart, rend):
    """
    Builds a single end cluster from the refmapped data.
    """

    ## If SE then we enforce the minimum overlap distance to avoid the
    ## staircase syndrome of multiple reads overlapping just a little.
    overlap_buffer = data._hackersonly["min_SE_refmap_overlap"]

    ## the *_buff variables here are because we have to play patty
    ## cake here with the rstart/rend vals because we want pysam to
    ## enforce the buffer for SE, but we want the reference sequence
    ## start and end positions to print correctly for downstream.
    rstart_buff = rstart + overlap_buffer
    rend_buff = rend - overlap_buffer

    ## Reads that map to only very short segements of the reference
    ## sequence will return buffer end values that are before the
    ## start values causing pysam to complain. Very short mappings.
    if rstart_buff > rend_buff:
        tmp = rstart_buff
        rstart_buff = rend_buff
        rend_buff = tmp
    ## Buffering can't make start and end equal or pysam returns nothing.
    if rstart_buff == rend_buff:
        rend_buff += 1

    ## store pairs
    rdict = {}
    clust = []
    iterreg = []

    iterreg = samfile.fetch(chrom, rstart_buff, rend_buff)

    ## use dict to match up read pairs
    for read in iterreg:
        if read.qname not in rdict:
            rdict[read.qname] = read

    ## sort dict keys so highest derep is first ('seed')
    sfunc = lambda x: int(x.split(";size=")[1].split(";")[0])
    rkeys = sorted(rdict.keys(), key=sfunc, reverse=True)

    ## get blocks from the seed for filtering, bail out if seed is not paired
    try:
        read1 = rdict[rkeys[0]]
    except ValueError:
        LOGGER.error("Found bad cluster, skipping - key:{} rdict:{}".format(rkeys[0], rdict))
        return ""

    ## the starting blocks for the seed
    poss = read1.get_reference_positions(full_length=True)
    seed_r1start = min(poss)
    seed_r1end = max(poss)

    ## store the seed -------------------------------------------
    if read1.is_reverse:
        seq = revcomp(read1.seq)
    else:
        seq = read1.seq

    ## store, could write orient but just + for now.
    size = sfunc(rkeys[0])
    clust.append(">{}:{}:{};size={};*\n{}"\
        .format(chrom, seed_r1start, seed_r1end, size, seq))

    ## If there's only one hit in this region then rkeys will only have
    ## one element and the call to `rkeys[1:]` will raise. Test for this.
    if len(rkeys) > 1:
        ## store the hits to the seed -------------------------------
        for key in rkeys[1:]:
            skip = False
            try:
                read1 = rdict[key]
            except ValueError:
                ## enter values that will make this read get skipped
                read1 = rdict[key][0]
                skip = True

            ## orient reads only if not skipping
            if not skip:
                poss = read1.get_reference_positions(full_length=True)
                minpos = min(poss)
                maxpos = max(poss)
                ## store the seq
                if read1.is_reverse:
                    seq = revcomp(read1.seq)
                else:
                    seq = read1.seq
                ## store, could write orient but just + for now.
                size = sfunc(key)
                clust.append(">{}:{}:{};size={};+\n{}"\
                    .format(chrom, minpos, maxpos, size, seq))
            else:
                ## seq is excluded, though, we could save it and return
                ## it as a separate cluster that will be aligned separately.
                pass

    return clust    


# In[62]:


cmd1 = [
    ip.bins.bwa, "mem",
    "-t", str(max(1, nthreads)),
    "-M",
    data.paramsdict['reference_sequence'], 
    ] 
cmd1 += [i for i in sample.concatfiles[0] if i]

# Insert optional flags for bwa
bwa_args = data._hackersonly["bwa_args"].split()
bwa_args.reverse()
for arg in bwa_args:
    cmd1.insert(2, arg)

cmd1_stdout_handle = os.path.join(
    data.dirs.refmapping, sample.name + ".sam")
cmd1_stdout = open(cmd1_stdout_handle, 'w')
cmd1_stderr = None

proc1 = sps.Popen(cmd1, stderr=cmd1_stderr, stdout=cmd1_stdout)


# In[63]:


error1 = proc1.communicate()[0]


# In[65]:


cmd2 = [
    ip.bins.samtools, "view", 
    "-b", 
    #"-q", "30",
    "-F", "0x904",
    "-U", os.path.join(
        data.dirs.refmapping, sample.name + "-unmapped.bam"), 
    os.path.join(data.dirs.refmapping, sample.name + ".sam")]


# In[69]:


proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE)
#proc2.communicate()


# In[74]:


proc3 = sps.Popen(cmd3, stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc2.stdout)
proc3.communicate()


# In[77]:


proc4 = sps.Popen(cmd4, stderr=sps.STDOUT, stdout=sps.PIPE)
proc4.communicate()


# In[81]:


sample.files.unmapped_reads = os.path.join(
    data.dirs.refmapping,
    "{}-unmapped.fastq".format(sample.name))


cmd4 = [ip.bins.samtools, "index", sample.files.mapped_reads]

# this is gonna read in the unmapped files, args are added below, 
# and it will output fastq formatted unmapped reads for merging.
# -v 45 sets the default qscore arbitrarily high
cmd5 = [
    ip.bins.samtools, "bam2fq",
    "-v 45",
    os.path.join(data.dirs.refmapping, sample.name + "-unmapped.bam")]

# Insert additional arguments for paired data to the commands.
# We assume Illumina paired end reads for the orientation 
# of mate pairs (orientation: ---> <----). 
if 'pair' in data.paramsdict["datatype"]:
    # add samtools filter for only keep if both pairs hit
    # 0x1 - Read is paired
    # 0x2 - Each read properly aligned
    cmd2.insert(2, "0x3")
    cmd2.insert(2, "-f")

    # tell bam2fq that there are output files for each read pair
    cmd5.insert(2, os.path.join(data.dirs.edits, sample.name + "-tmp-umap1.fastq"))
    cmd5.insert(2, "-1")
    cmd5.insert(2, os.path.join(data.dirs.edits, sample.name + "-tmp-umap2.fastq"))
    cmd5.insert(2, "-2")
else:
    cmd5.insert(2, sample.files.unmapped_reads)
    cmd5.insert(2, "-0")


# In[82]:


cmd5


# In[83]:


proc5 = sps.Popen(cmd5, stderr=sps.STDOUT, stdout=sps.PIPE)
error5 = proc5.communicate()[0]


# In[73]:


sample.files.mapped_reads = os.path.join(
    data.dirs.refmapping,
    "{}-mapped-sorted.bam".format(sample.name))

# this is gonna catch mapped bam output from cmd2 and write to file
cmd3 = [
    ip.bins.samtools, "sort", 
    "-T", os.path.join(data.dirs.refmapping, sample.name + ".sam.tmp"),
    "-O", "bam", 
    "-o", sample.files.mapped_reads]
cmd3


# In[53]:


data.dirs.refmapping


# In[17]:


nthreads = 2


cmd1 = [
    ip.bins.bwa, "mem",
    "-t", str(max(1, nthreads)),
    "-M",
    data.paramsdict['reference_sequence']
    ] 

cmd1
#cmd1 += sample.files.dereps


# In[ ]:


# In[4]:


s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient)
sample = s3.samples[0]
s3.run()


# In[7]:


#derep_sort_map(s3.data, sample, s3.force, s3.nthreads)
sample.concatfiles = concat_multiple_edits(s3.data, sample)


# In[13]:


sample.mergedfile = merge_pairs(s3.data, sample, 1, 1)  
sample.mergedfile


# In[12]:


new_derep_and_sort(s3.data, sample.mergedfile, os.path.join(s3.data.tmpdir, sample.name+ "_derep.fastq"), s3.nthreads)


# In[ ]:


strand = "plus"
if s3.data.paramsdict["datatype"] is ('gbs' or '2brad'):
    strand = "both"


cmd = [
    ip.bins.vsearch,
    "--derep_fulllength", sample.mergedfile,
    "--strand", strand,
    "--output", outfile,
    "--threads", str(nthreads),
    "--fasta_width", str(0),
    "--fastq_qmax", "1000",
    "--sizeout", 
    "--relabel_md5",
    ]


# In[14]:


s3.data.run("4")


# In[14]:


s3.remote_run_dereps()
#s3.remote_run_cluster_map_build()


# In[15]:


for sample in s3.samples:
    cluster(s3.data, sample, s3.nthreads, s3.force)


# In[8]:


for sample in s3.samples:
    build_clusters(s3.data, sample, s3.maxindels)


# In[26]:


for sample in s3.samples:
    muscle_chunker(s3.data, sample)


# In[33]:


aasyncs = {}
for sample in s3.samples:
    aasyncs[sample.name] = []
    for idx in range(10):
        handle = os.path.join(s3.data.tmpdir, 
            "{}_chunk_{}.ali".format(sample.name, idx))
        align_and_parse(handle, s3.maxindels, s3.gbs)


# In[32]:


def _get_derep_num(read):
    "return the number of replicates in a derep read"
    return int(read.split("=")[-1].split("\n")[0][:-1])


def _aligned_indel_filter(clust, max_internal_indels):
    """ checks for too many internal indels in muscle aligned clusters """

    ## make into list
    lclust = clust.split()
    
    ## paired or not
    try:
        seq1 = [i.split("nnnn")[0] for i in lclust[1::2]]
        seq2 = [i.split("nnnn")[1] for i in lclust[1::2]]
        intindels1 = [i.rstrip("-").lstrip("-").count("-") for i in seq1]
        intindels2 = [i.rstrip("-").lstrip("-").count("-") for i in seq2]
        intindels = intindels1 + intindels2
        if max(intindels) > max_internal_indels:
            return 1
    except IndexError:
        seq1 = lclust[1::2]
        intindels = [i.rstrip("-").lstrip("-").count("-") for i in seq1]
        if max(intindels) > max_internal_indels:
            return 1     
    return 0


# In[34]:


for sample in s3.samples:
    reconcat(s3.data, sample)


# In[27]:


def align_and_parse(handle, max_internal_indels=5, is_gbs=False):
    """ much faster implementation for aligning chunks """

    ## data are already chunked, read in the whole thing. bail if no data.
    try:
        with open(handle, 'rb') as infile:
            clusts = infile.read().decode().split("//\n//\n")
            # remove any empty spots
            clusts = [i for i in clusts if i]
            # Skip entirely empty chunks
            if not clusts:
                raise IPyradError("no clusters in file: {}".format(handle))

    except (IOError, IPyradError):
        LOGGER.debug("skipping empty chunk - {}".format(handle))
        return 0

    ## count discarded clusters for printing to stats later
    highindels = 0

    ## iterate over clusters sending each to muscle, splits and aligns pairs
    aligned = _persistent_popen_align3(clusts, 200, is_gbs)

    ## store good alignments to be written to file
    refined = []

    ## filter and trim alignments
    for clust in aligned:
        # check for too many internal indels
        if not _aligned_indel_filter(clust, max_internal_indels):
            refined.append(clust)
        else:
            highindels += 1

    ## write to file after
    if refined:
        outhandle = handle.rsplit(".", 1)[0] + ".aligned"
        with open(outhandle, 'wb') as outfile:
            outfile.write(str.encode("\n//\n//\n".join(refined) + "\n"))

    ## remove the old tmp file
    if not LOGGER.getEffectiveLevel() == 10:
        os.remove(handle)
    return highindels


# In[28]:


def _persistent_popen_align3(clusts, maxseqs=200, is_gbs=False):
    """ keeps a persistent bash shell open and feeds it muscle alignments """

    ## create a separate shell for running muscle in, this is much faster
    ## than spawning a separate subprocess for each muscle call
    proc = sps.Popen(
        ["bash"], 
        stdin=sps.PIPE, 
        stdout=sps.PIPE, 
        bufsize=0,
        )

    ## iterate over clusters in this file until finished
    aligned = []
    for clust in clusts:

        ## new alignment string for read1s and read2s
        align1 = []
        align2 = []

        ## don't bother aligning if only one seq
        if clust.count(">") == 1:
            aligned.append(clust.replace(">", "").strip())
        else:

            # do we need to split the alignment? (is there a PE insert?)
            try:
                # make into list (only read maxseqs lines, 2X cuz names)
                lclust = clust.split()[:maxseqs * 2]

                # try to split cluster list at nnnn separator for each read
                lclust1 = list(chain(*zip(
                    lclust[::2], [i.split("nnnn")[0] for i in lclust[1::2]])))
                lclust2 = list(chain(*zip(
                    lclust[::2], [i.split("nnnn")[1] for i in lclust[1::2]])))

                # put back into strings
                clust1 = "\n".join(lclust1)
                clust2 = "\n".join(lclust2)

                # Align the first reads.
                # The muscle command with alignment as stdin and // as split
                cmd1 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                        .format(clust1, ip.bins.muscle, "//\n"))

                # send cmd1 to the bash shell
                proc.stdin.write(cmd1.encode())

                # read the stdout by line until splitter is reached
                # meaning that the alignment is finished.
                for line in iter(proc.stdout.readline, b'//\n'):
                    align1.append(line.decode())

                # Align the second reads.
                # The muscle command with alignment as stdin and // as split
                cmd2 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                        .format(clust2, ip.bins.muscle, "//\n"))

                # send cmd2 to the bash shell
                proc.stdin.write(cmd2.encode())

                # read the stdout by line until splitter is reached
                # meaning that the alignment is finished.
                for line in iter(proc.stdout.readline, b'//\n'):
                    align2.append(line.decode())

                # join up aligned read1 and read2 and ensure names order match
                lines1 = "".join(align1)[1:].split("\n>")
                lines2 = "".join(align2)[1:].split("\n>")
                dalign1 = dict([i.split("\n", 1) for i in lines1])
                dalign2 = dict([i.split("\n", 1) for i in lines2])

                # sort the first reads
                keys = list(dalign1.keys())
                seed = [i for i in keys if i[-1] == "*"][0]
                keys.pop(keys.index(seed))
                order = [seed] + sorted(
                    keys, key=_get_derep_num, reverse=True)                

                # combine in order
                alignpe = []                
                for key in order:
                    alignpe.append("\n".join([
                        key, 
                        dalign1[key].replace("\n", "") + "nnnn" + \
                        dalign2[key].replace("\n", "")]))

                ## append aligned cluster string
                aligned.append("\n".join(alignpe).strip())

            # Malformed clust. Dictionary creation with only 1 element 
            except ValueError as inst:
                ip.logger.debug(
                    "Bad PE cluster - {}\nla1 - {}\nla2 - {}"
                    .format(clust, lines1, lines2))

            ## Either reads are SE, or at least some pairs are merged.
            except IndexError:
                    
                # limit the number of input seqs
                # use lclust already built before checking pairs
                lclust = "\n".join(clust.split()[:maxseqs * 2])

                # the muscle command with alignment as stdin and // as splitter
                cmd = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                       .format(lclust, ip.bins.muscle, "//\n"))

                ## send cmd to the bash shell (TODO: PIPE could overflow here!)
                proc.stdin.write(cmd.encode())

                ## read the stdout by line until // is reached. This BLOCKS.
                for line in iter(proc.stdout.readline, b'//\n'):
                    align1.append(line.decode())

                ## remove '>' from names, and '\n' from inside long seqs                
                lines = "".join(align1)[1:].split("\n>")

                ## find seed of the cluster and put it on top.
                seed = [i for i in lines if i.split(";")[-1][0] == "*"][0]
                lines.pop(lines.index(seed))
                lines = [seed] + sorted(
                    lines, key=_get_derep_num, reverse=True)

                ## format remove extra newlines from muscle
                aa = [i.split("\n", 1) for i in lines]
                align1 = [i[0] + '\n' + "".join([j.replace("\n", "") 
                          for j in i[1:]]) for i in aa]
                
                # trim edges in sloppy gbs/ezrad data. 
                # Maybe relevant to other types too...
                if is_gbs:
                    align1 = _gbs_trim(align1)

                ## append to aligned
                aligned.append("\n".join(align1))
               
    # cleanup
    proc.stdout.close()
    if proc.stderr:
        proc.stderr.close()
    proc.stdin.close()
    proc.wait()

    ## return the aligned clusters
    return aligned  


# In[35]:


for sample in s3.samples:
    ip.assemble.clustmap._sample_cleanup(s3.data, sample)


# In[36]:


ip.assemble.clustmap._get_quick_depths(s3.data, sample)


# In[37]:


if sample.files.get('clusters'):
    pass


# In[43]:


fclust = data.samples[sample.name].files.clusters
clusters = gzip.open(fclust, 'rt')
pairdealer = izip(*[iter(clusters)] * 2)

## storage
depths = []
maxlen = []

## start with cluster 0
tdepth = 0
tlen = 0

## iterate until empty
while 1:
    ## grab next
    try:
        name, seq = next(pairdealer)
    except StopIteration:
        break

    ## if not the end of a cluster
    #print name.strip(), seq.strip()
    #print(name)
    if name.strip() == seq.strip():
        depths.append(tdepth)
        maxlen.append(tlen)
        tlen = 0
        tdepth = 0

    else:
        tdepth += int(name.strip().split("=")[-1][:-1])
        tlen = len(seq)


# In[12]:


sample.name


# In[ ]:


# In[6]:


s3.remote_run_dereps()


# In[7]:


s3.remote_run_cluster_map_build()


# In[8]:


s3.remote_run_align_cleanup()


# In[90]:


handle = "pairtest-tmpalign/1A_0_chunk_0.ali"
with open(handle, 'rb') as infile:
    clusts = infile.read().decode().split("//\n//\n")
    # remove any empty spots
    clusts = [i for i in clusts if i]
    # Skip entirely empty chunks
    if not clusts:
        raise IPyradError("no clusters in file: {}".format(handle))


# In[91]:


maxseqs = 200
is_gbs = False


# In[ ]:


proc = sps.Popen(
    ["bash"], 
    stdin=sps.PIPE, 
    stdout=sps.PIPE, 
    bufsize=0,
    )

## iterate over clusters in this file until finished
aligned = []
for clust in clusts:

    ## new alignment string for read1s and read2s
    align1 = []
    align2 = []

    ## don't bother aligning if only one seq
    if clust.count(">") == 1:
        aligned.append(clust.replace(">", "").strip())
    else:

        # do we need to split the alignment? (is there a PE insert?)
        try:
            # make into list (only read maxseqs lines, 2X cuz names)
            lclust = clust.split()[:maxseqs * 2]

            # try to split cluster list at nnnn separator for each read
            lclust1 = list(chain(*zip(
                lclust[::2], [i.split("nnnn")[0] for i in lclust[1::2]])))
            lclust2 = list(chain(*zip(
                lclust[::2], [i.split("nnnn")[1] for i in lclust[1::2]])))

            # put back into strings
            clust1 = "\n".join(lclust1)
            clust2 = "\n".join(lclust2)

            # Align the first reads.
            # The muscle command with alignment as stdin and // as split
            cmd1 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                    .format(clust1, ip.bins.muscle, "//\n"))

            # send cmd1 to the bash shell
            proc.stdin.write(cmd1.encode())

            # read the stdout by line until splitter is reached
            # meaning that the alignment is finished.
            for line in iter(proc.stdout.readline, '//\n'):
                align1.append(line)

            # Align the second reads.
            # The muscle command with alignment as stdin and // as split
            cmd2 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                    .format(clust2, ip.bins.muscle, "//\n"))

            # send cmd2 to the bash shell
            proc.stdin.write(cmd2.encode())

            # read the stdout by line until splitter is reached
            # meaning that the alignment is finished.
            for line in iter(proc.stdout.readline, b'//\n'):
                align2.append(line)

            # join up aligned read1 and read2 and ensure names order match
            lines1 = "".join(align1)[1:].split("\n>")
            lines2 = "".join(align2)[1:].split("\n>")
            dalign1 = dict([i.split("\n", 1) for i in lines1])
            dalign2 = dict([i.split("\n", 1) for i in lines2])

            # sort the first reads
            keys = list(dalign1.keys())
            seed = [i for i in keys if i[-1] == "*"][0]
            keys.pop(keys.index(seed))
            order = [seed] + sorted(
                keys, key=_get_derep_num, reverse=True)                

            # combine in order
            for key in order:
                align1.append("\n".join([
                    key, 
                    dalign1[key].replace("\n", "") + "nnnn" + \
                    dalign2[key].replace("\n", "")]))

            ## append aligned cluster string
            aligned.append("\n".join(align1).strip())

        # Malformed clust. Dictionary creation with only 1 element 
        except ValueError as inst:
            ip.logger.debug(
                "Bad PE cluster - {}\nla1 - {}\nla2 - {}"
                .format(clust, lines1, lines2))

        ## Either reads are SE, or at least some pairs are merged.
        except IndexError:

            # limit the number of input seqs
            # use lclust already built before checking pairs
            lclust = "\n".join(clust.split()[:maxseqs * 2])

            # the muscle command with alignment as stdin and // as splitter
            cmd = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                   .format(lclust, ip.bins.muscle, "//\n"))

            ## send cmd to the bash shell (TODO: PIPE could overflow here!)
            proc.stdin.write(cmd.encode())

            ## read the stdout by line until // is reached. This BLOCKS.
            for line in iter(proc.stdout.readline, b'//\n'):
                align1.append(line.decode())

            ## remove '>' from names, and '\n' from inside long seqs                
            lines = "".join(align1)[1:].split("\n>")

            ## find seed of the cluster and put it on top.
            seed = [i for i in lines if i.split(";")[-1][0] == "*"][0]
            lines.pop(lines.index(seed))
            lines = [seed] + sorted(
                lines, key=_get_derep_num, reverse=True)

            ## format remove extra newlines from muscle
            aa = [i.split("\n", 1) for i in lines]
            align1 = [i[0] + '\n' + "".join([j.replace("\n", "") 
                      for j in i[1:]]) for i in aa]

            # trim edges in sloppy gbs/ezrad data. 
            # Maybe relevant to other types too...
            if is_gbs:
                align1 = _gbs_trim(align1)

            ## append to aligned
            aligned.append("\n".join(align1))

# cleanup
proc.stdout.close()
if proc.stderr:
    proc.stderr.close()
proc.stdin.close()
proc.wait()

## return the aligned clusters
#return aligned   


# In[ ]:


# In[37]:


from ipyrad.assemble.clustmap import _get_derep_num


# In[80]:


# join up aligned read1 and read2 and ensure names order match
lines1 = "".join(align1)[1:].split("\n>")
lines2 = "".join(align2)[1:].split("\n>")
dalign1 = dict([i.split("\n", 1) for i in lines1])
dalign2 = dict([i.split("\n", 1) for i in lines2])

# sort the first reads
keys = list(dalign1.keys())
seed = [i for i in keys if i[-1] == "*"][0]
keys.pop(keys.index(seed))
order = [seed] + sorted(
    keys, key=_get_derep_num, reverse=True)     


# In[82]:


# join up aligned read1 and read2 and ensure names order match
lines1 = "".join(align1)[1:].split("\n>")
lines2 = "".join(align2)[1:].split("\n>")
dalign1 = dict([i.split("\n", 1) for i in lines1])
dalign2 = dict([i.split("\n", 1) for i in lines2])

# sort the first reads
keys = list(dalign1.keys())
seed = [i for i in keys if i[-1] == "*"][0]
keys.pop(keys.index(seed))
order = [seed] + sorted(
    keys, key=_get_derep_num, reverse=True)                

# combine in order
for key in order:
    align1.append("\n".join([
        key, 
        dalign1[key].replace("\n", "") + "nnnn" + \
        dalign2[key].replace("\n", "")]))

## append aligned cluster string
aligned.append("\n".join(align1).strip())


# In[83]:


aligned


# In[57]:


## remove '>' from names, and '\n' from inside long seqs                
lines1 = "".join(align1)[1:].split("\n>")
seed = [i for i in lines1 if i.split(";")[-1][0] == "*"][0]
lines1.pop(lines1.index(seed))
lines1 = [seed] + sorted(
    lines1, key=_get_derep_num, reverse=True)
dalign1 = dict([i.split("\n", 1) for i in lines1])


lines2 = "".join(align2)[1:].split("\n>")
dalign2 = dict([i.split("\n", 1) for i in lines2])
#seed = [i for i in lines2 if i.split(";")[-1][0] == "*"][0]
#lines2.pop(lines2.index(seed))


# In[24]:


## format remove extra newlines from muscle
aa = [i.split("\n", 1) for i in lines]
align1 = [i[0] + '\n' + "".join([j.replace("\n", "") 
          for j in i[1:]]) for i in aa]


# In[25]:


align1


# In[16]:


derephandle = os.path.join(data.tmpdir, sample.name + "_derep.fastq")
uhandle = os.path.join(data.dirs.clusts, sample.name + ".utemp")
temphandle = os.path.join(data.dirs.clusts, sample.name + ".htemp")


# In[19]:


derepfile = os.path.join(data.tmpdir, sample.name + "_derep.fastq")
uhandle = os.path.join(data.dirs.clusts, sample.name + ".utemp")
usort = os.path.join(data.dirs.clusts, sample.name + ".utemp.sort")
hhandle = os.path.join(data.dirs.clusts, sample.name + ".htemp")
sample.files.clusters = os.path.join(
    data.dirs.clusts, sample.name + ".clust.gz")
clustsout = gzip.open(sample.files.clusters, 'wt')


# In[20]:


cmd = ["sort", "-k", "2", uhandle, "-o", usort]
proc = sps.Popen(cmd, close_fds=True)
proc.communicate()[0]


# In[21]:


alldereps = {}
with open(derepfile, 'rt') as ioderep:
    dereps = izip(*[iter(ioderep)] * 2)
    for namestr, seq in dereps:
        nnn, sss = [i.strip() for i in (namestr, seq)]  
        alldereps[nnn[1:]] = sss


# In[25]:


seedsseen = set()
maxindels = 8

## Iterate through the usort file grabbing matches to build clusters
with open(usort, 'rt') as insort:
    ## iterator, seed null, seqlist null
    isort = iter(insort)
    lastseed = 0
    fseqs = []
    seqlist = []
    seqsize = 0
    while 1:
        ## grab the next line
        try:
            hit, seed, _, ind, ori, _ = next(isort).strip().split()
        except StopIteration:
            break

        ## same seed, append match
        if seed != lastseed:
            seedsseen.add(seed)
            ## store the last cluster (fseq), count it, and clear fseq
            if fseqs:
                ## sort fseqs by derep after pulling out the seed
                fseqs = [fseqs[0]] + sorted(fseqs[1:], 
                    key=lambda x: 
                    int(x.split(";size=")[1].split(";")[0]), reverse=True)                    
                seqlist.append("\n".join(fseqs))
                seqsize += 1
                fseqs = []

            # occasionally write/dump stored clusters to file and clear mem
            if not seqsize % 10000:
                if seqlist:
                    clustsout.write(
                        "\n//\n//\n".join(seqlist) + "\n//\n//\n")
                    ## reset list and counter
                    seqlist = []

            ## store the new seed on top of fseq list
            fseqs.append(">{}*\n{}".format(seed, alldereps[seed]))
            lastseed = seed

        ## add match to the seed
        ## revcomp if orientation is reversed (comp preserves nnnn)
        if ori == "-":
            seq = comp(alldereps[hit])[::-1]
        else:
            seq = alldereps[hit]
        ## only save if not too many indels
        if int(ind) <= maxindels:
            fseqs.append(">{}{}\n{}".format(hit, ori, seq))
        else:
            ip.logger.info("filtered by maxindels: %s %s", ind, seq)

## write whatever is left over to the clusts file
if fseqs:
    seqlist.append("\n".join(fseqs))
if seqlist:
    clustsout.write("\n//\n//\n".join(seqlist) + "\n//\n//\n")

## now write the seeds that had no hits. Make dict from htemp
with open(hhandle, 'rt') as iotemp:
    nohits = izip(*[iter(iotemp)] * 2)
    seqlist = []
    seqsize = 0
    while 1:
        try:
            nnn, _ = [i.strip() for i in next(nohits)]
        except StopIteration:
            break

        ## occasionally write to file
        if not seqsize % 10000:
            if seqlist:
                clustsout.write("\n//\n//\n".join(seqlist) + "\n//\n//\n")
                ## reset list and counter
                seqlist = []

        ## append to list if new seed
        if nnn[1:] not in seedsseen:
            seqlist.append("{}*\n{}".format(nnn, alldereps[nnn[1:]]))
            seqsize += 1

## write whatever is left over to the clusts file
if seqlist:
    clustsout.write("\n//\n//\n".join(seqlist))

## close the file handle
clustsout.close()
del alldereps


# In[28]:


seedsseen


# In[6]:


sample.concatfiles = concat_multiple_edits(data, sample)


# In[7]:


sample.mergedfile = merge_pairs(data, sample, 1, 1)  


# In[10]:


new_derep_and_sort(
    data,
    sample.mergedfile,
    os.path.join(data.tmpdir, sample.name + "_derep.fastq"),
    2)


# In[6]:


data = data
sample = sample
revcomp = 1
vsearch_merge = 1

sample.concatfiles = concat_multiple_edits(data, sample)
sample.mergefile = os.path.join(data.tmpdir, sample.name + "_merged_.fastq")       


# In[8]:


sample.mergefile = os.path.join(data.tmpdir, sample.name + "_merged_.fastq")       
if 'pair' in data.paramsdict['datatype']:
    if "reference" not in data.paramsdict["assembly_method"]:
        nmerged = ip.assemble.clustmap._merge_pairs(data, sample, 1, 1)
    else:
        nmerged = 0  # _merge_pairs(data, sample, 0, 0)          
    sample.stats.reads_merged = nmerged


# In[ ]:


new_derep_and_sort(data, sampl)


# In[10]:


def new_derep_and_sort(data, infile, outfile, nthreads):
    """
    Dereplicates reads and sorts so reads that were highly replicated are at
    the top, and singletons at bottom, writes output to derep file. Paired
    reads are dereplicated as one concatenated read and later split again.
    Updated this function to take infile and outfile to support the double
    dereplication that we need for 3rad (5/29/15 iao).
    """
    ## datatypes options
    strand = "plus"
    if data.paramsdict["datatype"] is ('gbs' or '2brad'):
        strand = "both"

    ## do dereplication with vsearch
    cmd = [
        ip.bins.vsearch,
        "--derep_fulllength", infile,
        "--strand", strand,
        "--output", outfile,
        "--threads", str(nthreads),
        "--fasta_width", str(0),
        "--fastq_qmax", "1000",
        "--sizeout", 
        "--relabel_md5",
        ]
    ip.logger.info("derep cmd %s", cmd)

    ## build PIPEd job
    proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE, close_fds=True)
    errmsg = proc.communicate()[0]
    if proc.returncode:
        ip.logger.error("error inside derep_and_sort %s", errmsg)
        raise IPyradWarningExit(errmsg)


# In[ ]:


new_derep_and_sort()


# In[43]:


## CONCAT FILES FOR MERGED ASSEMBLIES
mergefile = os.path.join(data.tmpdir, sample.name + "_merged_.fastq")
sample.files.edits = concat_multiple_edits(data, sample)


# In[41]:


sample.files.edits = [(mergefile, )]
nmerged = ip.assemble.clustmap._merge_pairs(
    data, sample.files.edits, mergefile, 1, 1)
sample.stats.reads_merged = nmerged


# In[33]:


mergefile


# In[44]:


sample.files.edits


# In[5]:


s3.run()


# In[5]:


# muscle align returns values for bad alignments
ip.assemble.cluster_within.sample_cleanup(data, samples[0])


# In[6]:


data._build_stat("s3")


# In[7]:


raise IPyradError("hi")


# In[4]:


self = data
derep_sort_map(s3.data, samples[0], s3.force)


# In[11]:


ra.exception()


# In[ ]:


data.get_params()