#!/usr/bin/env python
# coding: utf-8

# In[1]:

# explicit imports for names used throughout this notebook
# (several of these are also pulled in by the star imports below)
import os
import glob
import subprocess as sps
import numpy as np
import numba
import ipyrad
import ipyrad as ip
import ipyparallel as ipp
from ipyrad.assemble.clustmap import *
from ipyrad.assemble.cluster_across import *

# In[2]:

data = ip.Assembly("5-cyatho")
data.set_params("project_dir", "5-cyatho")
data.set_params("sorted_fastq_path", "./example_empirical_rad/*.gz")
data.set_params("filter_adapters", 2)
data.set_params("clust_threshold", .90)

# In[17]:

data.run("12")

# In[2]:

from ipyrad.assemble.clustmap import *
ipyclient = ipp.Client()
s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient)
samples = list(s3.data.samples.values())
sample = samples[1]

# In[19]:

s3.run()

# In[21]:

data.run("45")

# In[3]:

data = ip.load_json("5-cyatho/5-cyatho.json")
samples = list(data.samples.values())
ipyclient = ipp.Client()

# In[4]:

from ipyrad.assemble.cluster_across import *

popmins = {'o': 0, 'c': 3, 'r': 3}
popdict = {
    'o': ['33588_przewalskii', '32082_przewalskii'],
    'c': ['29154_superba', '30686_cyathophylla',
          '41478_cyathophylloides', '41954_cyathophylloides'],
    'r': ['35236_rex', '35855_rex', '38362_rex', '39618_rex',
          '40578_rex', '30556_thamno', '33413_thamno'],
}
data._link_populations(popdict, popmins)
data.populations

# In[5]:

s6 = Step6(data, samples, ipyclient, 0, 0)

# In[7]:

s6.remote_build_concats_tier1()

# In[8]:

s6.remote_cluster1()

# In[9]:

s6.remote_build_concats_tier2()

# In[10]:

s6.remote_cluster2()

# In[17]:

s6.remote_build_denovo_clusters()

# In[13]:

s6.remote_align_denovo_clusters()

# In[30]:

chunk = "./5-cyatho/5-cyatho_across/5-cyatho-tmpalign/5-cyatho.chunk_11760"
align_to_array(s6.data, s6.samples, chunk)

# In[29]:

def align_to_array(data, samples, chunk):
    # data are already chunked, read in the whole thing
    with open(chunk, 'rt') as infile:
        clusts = infile.read().split("//\n//\n")[:-1]

    # snames to ensure sorted order
    samples.sort(key=lambda x: x.name)
    snames = [sample.name for sample in samples]

    # make a tmparr to store metadata (this can get huge, consider using h5)
    maxvar = 25
    maxlen = data._hackersonly["max_fragment_length"] + 20
    maxinds = data.paramsdict["max_Indels_locus"]
    ifilter = np.zeros(len(clusts), dtype=np.bool_)
    dfilter = np.zeros(len(clusts), dtype=np.bool_)
    varpos = np.zeros((len(clusts), maxvar), dtype='u1')
    indels = np.zeros((len(clusts), len(samples), sum(maxinds)), dtype='u1')
    edges = np.zeros((len(clusts), len(samples), 4), dtype='u1')
    consens = np.zeros((len(clusts), maxlen), dtype="S1")

    # create a persistent shell for running muscle in.
    proc = sps.Popen(["bash"], stdin=sps.PIPE, stdout=sps.PIPE, bufsize=0)

    # iterate over clusters until finished
    allstack = []
    for ldx in range(len(clusts)):
        istack = []
        lines = clusts[ldx].strip().split("\n")
        names = lines[::2]
        seqs = lines[1::2]
        seqs = [i[:90] for i in seqs]
        align1 = []

        # find duplicates and skip aligning but keep it for downstream.
        unames = set([i.rsplit("_", 1)[0] for i in names])
        if len(unames) < len(names):
            dfilter[ldx] = True
            istack = ["{}\n{}".format(i[1:], j) for i, j in zip(names, seqs)]
        else:
            # append counter to names because muscle doesn't retain order
            nnames = [">{};*{}".format(j[1:], i) for i, j in enumerate(names)]

            # make back into strings
            cl1 = "\n".join(["\n".join(i) for i in zip(nnames, seqs)])

            # store allele (lowercase) info
            amask, abool = store_alleles(seqs)

            # send align1 to the bash shell (TODO: check for pipe-overflow)
            cmd1 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                    .format(cl1, ipyrad.bins.muscle, "//\n"))
            proc.stdin.write(cmd1.encode())

            # read the stdout by line until splitter is reached
            for line in iter(proc.stdout.readline, b'//\n'):
                align1.append(line.decode())

            # reorder b/c muscle doesn't keep order
            lines = "".join(align1)[1:].split("\n>")
            dalign1 = dict([i.split("\n", 1) for i in lines])
            keys = sorted(
                dalign1.keys(),
                key=lambda x: int(x.rsplit("*")[-1]))
            seqarr = np.zeros(
                (len(nnames), len(dalign1[keys[0]].replace("\n", ""))),
                dtype='S1',
            )
            for kidx, key in enumerate(keys):
                concatseq = dalign1[key].replace("\n", "")
                seqarr[kidx] = list(concatseq)

                # fill in edges for each seq
                edg = (
                    len(concatseq) - len(concatseq.lstrip('-')),
                    len(concatseq.rstrip('-')),
                    len(concatseq.rstrip('-')),
                    len(concatseq.rstrip('-')),
                )
                sidx = snames.index(key.rsplit('_', 1)[0])
                edges[ldx, sidx] = edg

            # get ref/variant counts in an array
            varmat = refbuild(seqarr.view('u1'), PSEUDO_REF)
            constmp = varmat[:, 0].view('S1')
            consens[ldx, :constmp.size] = constmp
            varstmp = np.nonzero(varmat[:, 1])[0]
            varpos[ldx, :varstmp.size] = varstmp

            # impute allele information back into the read b/c muscle
            wkeys = np.argsort([i.rsplit("_", 1)[0] for i in keys])

            # sort in sname (alphanumeric) order for indels storage
            for idx in wkeys:
                # seqarr and amask are in input order here
                args = (seqarr, idx, amask)
                seqarr[idx], indidx = retrieve_indels_and_alleles(*args)
                sidx = snames.index(names[idx].rsplit("_", 1)[0][1:])
                if indidx.size:
                    indsize = min(indidx.size, sum(maxinds))
                    indels[ldx, sidx, :indsize] = indidx[:indsize]
                wname = names[idx]

                # store (only for visually checking alignments)
                istack.append(
                    "{}\n{}".format(wname, b"".join(seqarr[idx]).decode()))

        # indel filter
        if indels.sum(axis=1).max() >= sum(maxinds):
            ifilter[ldx] = True

        # store the stack (only for visually checking alignments)
        if istack:
            allstack.append("\n".join(istack))

    # cleanup
    proc.stdout.close()
    if proc.stderr:
        proc.stderr.close()
    proc.stdin.close()
    proc.wait()

    # write to file after (only for visually checking alignments)
    odx = chunk.rsplit("_")[-1]
    alignfile = os.path.join(data.tmpdir, "aligned_{}.fa".format(odx))
    with open(alignfile, 'wt') as outfile:
        outfile.write("\n//\n//\n".join(allstack) + "\n")
    #os.remove(chunk)

    # save indels array to tmp dir
    np.save(
        os.path.join(data.tmpdir, "ifilter_{}.tmp.npy".format(odx)),
        ifilter)
    np.save(
        os.path.join(data.tmpdir, "dfilter_{}.tmp.npy".format(odx)),
        dfilter)

# In[13]:

data = ip.load_json("4-refpairtest.json")
samples = list(data.samples.values())
sample = samples[0]
force = True
ipyclient = ipp.Client()

# In[3]:

data.samples

# In[4]:

popmins = {'1': 3, '2': 3, '3': 3}
popdict = {
    '1': ['1A_0', '1B_0', '1C_0', '1D_0'],
    '2': ['2E_0', '2F_0', '2G_0', '2H_0'],
    '3': ['3I_0', '3J_0', '3K_0', '3L_0'],
}
data._link_populations(popdict, popmins)
data.populations

# In[5]:

s = Step6(data, samples, ipyclient, 123, True)
s.samples

# In[6]:

s.remote_build_concats_tier1()

# In[7]:

s.remote_cluster1()

# In[8]:

s.remote_build_concats_tier2()

# In[9]:

s.remote_cluster2()

# In[10]:

s.remote_build_denovo_clusters()

# In[11]:

chunks = glob.glob('4-refpairtest_across/4-refpairtest-tmpalign/*.chunk*')
for chunk in chunks:
    align_to_array(s.data, s.samples, chunk)

# In[317]:

chunk = "4-refpairtest_across/4-refpairtest-tmpalign/4-refpairtest.chunk_70"
samples = s.samples
data = s.data

# data are already chunked, read in the whole thing
#with open(chunk, 'rt') as infile:
#    clusts = infile.read().split("//\n//\n")[:-1]

# In[389]:

nnames

# In[451]:

def align_to_array(data, samples, chunk):
    # data are already chunked, read in the whole thing
    with open(chunk, 'rt') as infile:
        clusts = infile.read().split("//\n//\n")[:-1]

    # snames to ensure sorted order
    samples.sort(key=lambda x: x.name)
    snames = [sample.name for sample in samples]

    # make a tmparr to store metadata (this can get huge, consider using h5)
    maxvar = 25
    maxlen = data._hackersonly["max_fragment_length"] + 20
    maxinds = data.paramsdict["max_Indels_locus"]
    ifilter = np.zeros(len(clusts), dtype=np.bool_)
    dfilter = np.zeros(len(clusts), dtype=np.bool_)
    varpos = np.zeros((len(clusts), maxvar), dtype='u1')
    indels = np.zeros((len(clusts), len(samples), sum(maxinds)), dtype='u1')
    edges = np.zeros((len(clusts), len(samples), 4), dtype='u1')
    consens = np.zeros((len(clusts), maxlen), dtype="S1")

    # create a persistent shell for running muscle in.
    proc = sps.Popen(["bash"], stdin=sps.PIPE, stdout=sps.PIPE, bufsize=0)

    # iterate over clusters until finished
    allstack = []
    for ldx in range(len(clusts)):
        istack = []
        lines = clusts[ldx].strip().split("\n")
        names = lines[::2]
        seqs = lines[1::2]
        align1 = []

        # find duplicates and skip aligning/ordering since it will be filtered.
        unames = set([i.rsplit("_", 1)[0] for i in names])
        if len(unames) < len(names):
            dfilter[ldx] = True
            istack = ["{}\n{}".format(i[1:], j) for i, j in zip(names, seqs)]
        else:
            # append counter to names because muscle doesn't retain order
            nnames = [">{};*{}".format(j[1:], i) for i, j in enumerate(names)]

            # make back into strings
            cl1 = "\n".join(["\n".join(i) for i in zip(nnames, seqs)])

            # store allele (lowercase) info on seqs
            amask, abool = store_alleles(seqs)

            # send align1 to the bash shell (TODO: check for pipe-overflow)
            cmd1 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                    .format(cl1, ipyrad.bins.muscle, "//\n"))
            proc.stdin.write(cmd1.encode())

            # read the stdout by line until splitter is reached
            for line in iter(proc.stdout.readline, b'//\n'):
                align1.append(line.decode())

            # reorder into original order and arrange into a dictionary and arr
            lines = "".join(align1)[1:].split("\n>")
            dalign1 = dict([i.split("\n", 1) for i in lines])
            keys = sorted(
                dalign1.keys(),
                key=lambda x: int(x.rsplit("*")[-1]))
            seqarr = np.zeros(
                (len(nnames), len(dalign1[keys[0]].replace("\n", ""))),
                dtype='S1',
            )
            for kidx, key in enumerate(keys):
                concatseq = dalign1[key].replace("\n", "")
                seqarr[kidx] = list(concatseq)

                # fill in edges for each seq
                edg = (
                    len(concatseq) - len(concatseq.lstrip('-')),
                    len(concatseq.rstrip('-')),
                    len(concatseq.rstrip('-')),
                    len(concatseq.rstrip('-')),
                )
                sidx = snames.index(key.rsplit('_', 1)[0])
                edges[ldx, sidx] = edg

            # get ref/variant counts in an array
            varmat = refbuild(seqarr.view('u1'), PSEUDO_REF)
            constmp = varmat[:, 0].view("S1")
            consens[ldx, :constmp.size] = constmp
            varstmp = np.nonzero(varmat[:, 1])[0]
            varpos[ldx, :varstmp.size] = varstmp

            # impute allele information back into the read b/c muscle
            wkeys = np.argsort([i.rsplit("_", 1)[0] for i in keys])

            # sort in sname (alphanumeric) order for indels storage
            for idx in wkeys:
                # seqarr and amask are in input order here
                args = (seqarr, idx, amask)
                seqarr[idx], indidx = retrieve_indels_and_alleles(*args)
                sidx = snames.index(names[idx].rsplit("_", 1)[0][1:])
                if indidx.size:
                    # clamp to the max indels allowed per locus
                    indsize = min(indidx.size, sum(maxinds))
                    indels[ldx, sidx, :indsize] = indidx[:indsize]
                wname = names[idx]
                istack.append(
                    "{}\n{}".format(wname, b"".join(seqarr[idx]).decode()))

        # store the stack
        if istack:
            allstack.append("\n".join(istack))

    # cleanup
    proc.stdout.close()
    if proc.stderr:
        proc.stderr.close()
    proc.stdin.close()
    proc.wait()

    # write to file after
    odx = chunk.rsplit("_")[-1]
    alignfile = os.path.join(data.tmpdir, "align_{}.fa".format(odx))
    with open(alignfile, 'wt') as outfile:
        outfile.write("\n//\n//\n".join(allstack) + "\n")
    #os.remove(chunk)

    # save indels array to tmp dir
    ifile = os.path.join(data.tmpdir, "indels_{}.tmp.npy".format(odx))
    np.save(ifile, ifilter)
    dfile = os.path.join(data.tmpdir, "duples_{}.tmp.npy".format(odx))
    np.save(dfile, dfilter)

# In[456]:

def store_alleles(seqs):
    shape = (len(seqs), max([len(i) for i in seqs]))
    arrseqs = np.zeros(shape, dtype=np.bytes_)
    for row in range(arrseqs.shape[0]):
        seqsrow = seqs[row]
        arrseqs[row, :len(seqsrow)] = list(seqsrow)
    amask = np.char.islower(arrseqs)
    if np.any(amask):
        return amask, True
    else:
        return amask, False


def retrieve_indels_and_alleles(seqarr, idx, amask):
    concatarr = seqarr[idx]
    newmask = np.zeros(len(concatarr), dtype=np.bool_)

    # check for indels and impute to amask
    indidx = np.where(concatarr == b"-")[0]
    if np.sum(amask):
        print('yes')
        if indidx.size:
            # impute for position of variants
            allrows = np.arange(amask.shape[1])
            mask = np.ones(allrows.shape[0], dtype=np.bool_)
            for gap in indidx:
                if gap < mask.shape[0]:
                    mask[gap] = False
            not_idx = allrows[mask == 1]

            # fill in new data into all other spots
            newmask[not_idx] = amask[idx, :not_idx.shape[0]]
        else:
            newmask = amask[idx]

        # lower the alleles
        concatarr[newmask] = np.char.lower(concatarr[newmask])

    return concatarr, indidx

# In[462]:

np.load("4-refpairtest_across/4-refpairtest-tmpalign/indels_35.tmp.npy")

# In[457]:

align_to_array(data, samples, chunk)

# In[459]:

chunks = glob.glob("4-refpairtest_across/4-refpairtest-tmpalign/4-refpairtest.chunk_*")
for chunk in chunks:
    align_to_array(data, samples, chunk)

# In[263]:

@numba.jit(nopython=True)
def refbuild(iseq, consdict):
    """ Returns the most common base at each site in order. """
    altrefs = np.zeros((iseq.shape[1], 4), dtype=np.uint8)

    for col in range(iseq.shape[1]):
        ## expand columns with ambigs and remove N-
        fcounts = np.zeros(111, dtype=np.int64)
        counts = np.bincount(iseq[:, col])  #, minlength=90)
        fcounts[:counts.shape[0]] = counts

        ## set N and - to zero, wish numba supported minlen arg
        fcounts[78] = 0
        fcounts[45] = 0

        ## add ambig counts to true bases
        for aidx in range(consdict.shape[0]):
            nbases = fcounts[consdict[aidx, 0]]
            for _ in range(nbases):
                fcounts[consdict[aidx, 1]] += 1
                fcounts[consdict[aidx, 2]] += 1
            fcounts[consdict[aidx, 0]] = 0

        ## now get counts from the modified counts arr
        who = np.argmax(fcounts)
        altrefs[col, 0] = who
        fcounts[who] = 0

        ## if an alt allele fill over the "." placeholder
        who = np.argmax(fcounts)
        if who:
            altrefs[col, 1] = who
            fcounts[who] = 0

            ## if 3rd or 4th alleles observed then add to arr
            who = np.argmax(fcounts)
            altrefs[col, 2] = who
            fcounts[who] = 0

            ## if 3rd or 4th alleles observed then add to arr
            who = np.argmax(fcounts)
            altrefs[col, 3] = who

    return altrefs


PSEUDO_REF = np.array([
    [82, 71, 65],
    [75, 71, 84],
    [83, 71, 67],
    [89, 84, 67],
    [87, 84, 65],
    [77, 67, 65],
    [78, 78, 78],
    [45, 45, 45],
], dtype=np.uint8)

# In[274]:

seqarr.view("u1")

# In[132]:

aseqs = np.array([
    list('ATG--GGA'),
    list('A--GGGGA'),
    list('---GGGGA'),
    list('--------'),
])
np.where(aseqs == "-")

# In[156]:

PSEUDO_REF = np.array([
    [82, 71, 65],
    [75, 71, 84],
    [83, 71, 67],
    [89, 84, 67],
    [87, 84, 65],
    [77, 67, 65],
    [78, 78, 78],
    [45, 45, 45],
], dtype=np.uint8)

# In[164]:

@numba.jit(nopython=True)
def reftrick(iseq, consdict):
    """ Returns the most common base at each site in order. """
    altrefs = np.zeros((iseq.shape[1], 4), dtype=np.uint8)
    #altrefs[:, 1] = 46

    for col in range(iseq.shape[1]):
        ## expand columns with ambigs and remove N-
        fcounts = np.zeros(111, dtype=np.int64)
        counts = np.bincount(iseq[:, col])  #, minlength=90)
        fcounts[:counts.shape[0]] = counts

        ## set N and - to zero, wish numba supported minlen arg
        fcounts[78] = 0
        fcounts[45] = 0

        ## add ambig counts to true bases
        for aidx in range(consdict.shape[0]):
            nbases = fcounts[consdict[aidx, 0]]
            for _ in range(nbases):
                fcounts[consdict[aidx, 1]] += 1
                fcounts[consdict[aidx, 2]] += 1
            fcounts[consdict[aidx, 0]] = 0

        ## now get counts from the modified counts arr
        who = np.argmax(fcounts)
        altrefs[col, 0] = who
        fcounts[who] = 0

        ## if an alt allele fill over the "." placeholder
        who = np.argmax(fcounts)
        if who:
            altrefs[col, 1] = who
            fcounts[who] = 0

            ## if 3rd or 4th alleles observed then add to arr
            who = np.argmax(fcounts)
            altrefs[col, 2] = who
            fcounts[who] = 0

            ## if 3rd or 4th alleles observed then add to arr
            who = np.argmax(fcounts)
            altrefs[col, 3] = who

    return altrefs

# In[195]:

sss = np.array([
    list("---AAAA----"),
    list("---TTAA----"),
    list("AAAAAAAAAAA"),
    list("AATAAAATAAA"),
], dtype="S1")
sss

# In[196]:

varmat = reftrick(sss.view(np.uint8), PSEUDO_REF)
varmat

# In[226]:

names

# In[202]:

get_ipython().run_cell_magic('timeit', '', 'store_alleles(seqs)\n')

# In[219]:

[b"".join(i) for i in arrseqs]

# In[229]:

snames

# In[237]:

widx = np.argsort([i.rsplit(";", 1)[0] for i in keys])
for i in widx:
    print(names[i], b"".join(seqarr[i]))

# In[184]:

consensus = varmat[:, 0]
variants = np.nonzero(varmat[:, 1])[0]
variants

# In[148]:

seqarr == seqarr[0]

# In[145]:

for kidx in range(seqarr.shape[0]):
    pass

# In[ ]:

# In[ ]:

# In[133]:

# reorder b/c muscle doesn't keep order
lines = "".join(align1)[1:].split("\n>")
dalign1 = dict([i.split("\n", 1) for i in lines])
keys = sorted(
    dalign1.keys(),
    key=lambda x: int(x.rsplit("*")[-1]))
seqarr = np.zeros(
    (len(nnames), len(dalign1[keys[0]])),
    dtype='U1',
)
for kidx, key in enumerate(keys):
    concatseq = dalign1[key].replace("\n", '')
    seqarr[kidx] = list(dalign1[key].replace("\n", ""))
seqarr

# In[101]:

# fill in edges for each seq
edg = (
    len(concatseq) - len(concatseq.lstrip('-')),
    len(concatseq.rstrip('-')),
    len(concatseq.rstrip('-')),
    len(concatseq.rstrip('-')),
)
edges[ldx, kidx] = edg
edges[ldx, kidx]
concatseq[0:50] + 'nnnn'[50:50] + concatseq[50:50]

# In[22]:

seeds = [
    os.path.join(
        data.dirs.across,
        "{}-{}.htemp".format(data.name, jobid))
    for jobid in jobids
]
allseeds = os.path.join(data.dirs.across, "{}-x.fa".format(data.name))
cmd1 = ['cat'] + seeds
cmd2 = [ipyrad.bins.vsearch,
        '--sortbylength', '-',
        '--output', allseeds]
proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)
proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=sps.PIPE, close_fds=True)
out = proc2.communicate()
proc1.stdout.close()

# In[23]:

out

# In[12]:

data = s.data
jobid = 2
nthreads = 4

# In[14]:

# get files for this jobid
catshuf = os.path.join(
    data.dirs.across,
    "{}-{}-catshuf.fa".format(data.name, jobid))
uhaplos = os.path.join(
    data.dirs.across,
    "{}-{}.utemp".format(data.name, jobid))
hhaplos = os.path.join(
    data.dirs.across,
    "{}-{}.htemp".format(data.name, jobid))
#logfile = os.path.join(data.dirs.across, "s6_cluster_stats.txt")

## parameters that vary by datatype
## (cov values that are too low yield too many poor alignments)
strand = "plus"
cov = 0.75  # 0.90
if data.paramsdict["datatype"] in ["gbs", "2brad"]:
    strand = "both"
    cov = 0.60
elif data.paramsdict["datatype"] == "pairgbs":
    strand = "both"
    cov = 0.75  # 0.90

## nthreads is calculated in 'call_cluster()'
cmd = [ipyrad.bins.vsearch,
       "-cluster_smallmem", catshuf,
       "-strand", strand,
       "-query_cov", str(cov),
       "-minsl", str(0.5),
       "-id", str(data.paramsdict["clust_threshold"]),
       "-userout", uhaplos,
       "-notmatched", hhaplos,
       "-userfields", "query+target+qstrand",
       "-maxaccepts", "1",
       "-maxrejects", "0",
       "-fasta_width", "0",
       "-threads", str(nthreads),  # "0",
       "-fulldp",
       "-usersort",
       ]
proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)
out = proc.communicate()
if proc.returncode:
    raise IPyradError(out)

# In[20]:

proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)
out = proc.communicate()
if proc.returncode:
    raise IPyradError(out)

# In[22]:

build_concat_files(s.data, 1, [s.data.samples[i] for i in s.cgroups[1]], 123)

# In[23]:

data = s.data
jobid = 1
samples = [s.data.samples[i] for i in s.cgroups[1]]
randomseed = 123

# In[24]:

conshandles = [
    sample.files.consens[0] for sample in samples
    if sample.stats.reads_consens]
conshandles.sort()
assert conshandles, "no consensus files found"

# In[26]:

## concatenate all of the gzipped consens files
cmd = ['cat'] + conshandles
groupcons = os.path.join(
    data.dirs.across,
    "{}-{}-catcons.gz".format(data.name, jobid))
LOGGER.debug(" ".join(cmd))
with open(groupcons, 'w') as output:
    call = sps.Popen(cmd, stdout=output, close_fds=True)
    call.communicate()

## a string of sed substitutions for temporarily replacing hetero sites
## skips lines with '>', so it doesn't affect taxon names
subs = ["/>/!s/W/A/g", "/>/!s/w/A/g", "/>/!s/R/A/g", "/>/!s/r/A/g",
        "/>/!s/M/A/g", "/>/!s/m/A/g", "/>/!s/K/T/g", "/>/!s/k/T/g",
        "/>/!s/S/C/g", "/>/!s/s/C/g", "/>/!s/Y/C/g", "/>/!s/y/C/g"]
subs = ";".join(subs)

## impute pseudo-haplo information to avoid mismatch at hetero sites
## the read data with hetero sites is put back into clustered data later.
## pipe passed data from gunzip to sed.
cmd1 = ["gunzip", "-c", groupcons] cmd2 = ["sed", subs] LOGGER.debug(" ".join(cmd1)) LOGGER.debug(" ".join(cmd2)) proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True) allhaps = groupcons.replace("-catcons.gz", "-cathaps.fa") with open(allhaps, 'w') as output: proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=output, close_fds=True) proc2.communicate() proc1.stdout.close() ## now sort the file using vsearch allsort = groupcons.replace("-catcons.gz", "-catsort.fa") cmd1 = [ipyrad.bins.vsearch, "--sortbylength", allhaps, "--fasta_width", "0", "--output", allsort] LOGGER.debug(" ".join(cmd1)) proc1 = sps.Popen(cmd1, close_fds=True) proc1.communicate() # In[28]: data.dirs.across # In[8]: # calculate the theading of cluster1 jobs: ncpus = len(self.ipyclient.ids) njobs = len(self.cgroups) nnodes = len(self.hostd) minthreads = 2 maxthreads = 10 # In[9]: # product targets = [0, 1] self.thview = self.ipyclient.load_balanced_view(targets=targets) # In[10]: s.prepare_concats() # In[31]: cluster1(data, 2, 4) # In[30]: def cluster1(data, jobid, nthreads): # get files for this jobid catshuf = os.path.join( data.dirs.across, "{}-{}-catshuf.fa".format(data.name, jobid)) uhaplos = os.path.join( data.dirs.across, "{}-{}.utemp".format(data.name, jobid)) hhaplos = os.path.join( data.dirs.across, "{}-{}.htemp".format(data.name, jobid)) ## parameters that vary by datatype ## (too low of cov values yield too many poor alignments) strand = "plus" cov = 0.75 # 0.90 if data.paramsdict["datatype"] in ["gbs", "2brad"]: strand = "both" cov = 0.60 elif data.paramsdict["datatype"] == "pairgbs": strand = "both" cov = 0.75 # 0.90 ## nthreads is calculated in 'call_cluster()' cmd = [ipyrad.bins.vsearch, "-cluster_smallmem", catshuf, "-strand", strand, "-query_cov", str(cov), "-minsl", str(0.5), "-id", str(data.paramsdict["clust_threshold"]), "-userout", uhaplos, "-notmatched", hhaplos, "-userfields", "query+target+qstrand", "-maxaccepts", "1", "-maxrejects", "0", "-fasta_width", "0", "-threads", str(nthreads), # "0", "-fulldp", "-usersort", ] proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE) out = proc.communicate() if proc.returncode: raise IPyradError(out) # In[12]: if len(self.samples) < 50: nclusters = 1 elif len(self.samples) < 100: nclusters = 2 elif len(self.samples) < 200: nclusters = 4 elif len(self.samples) > 500: nclusters = 10 else: nclusters = int(len(self.samples) / 10) # In[ ]: # In[10]: self.cgroups # In[48]: hosts # In[ ]: