Notebook

In [1]:

from ipyrad.assemble.clustmap import *
from ipyrad.assemble.cluster_across import *
import ipyparallel as ipp

In [2]:

data = ip.Assembly("5-cyatho")
data.set_params("project_dir", "5-cyatho")
data.set_params("sorted_fastq_path", "./example_empirical_rad/*.gz")
data.set_params("filter_adapters", 2)
data.set_params("clust_threshold", .90)

New Assembly: 5-cyatho

In [17]:

data.run("12")

Assembly: 5-cyatho
[####################] 100% 0:00:12 | loading reads        | s1 |
[####################] 100% 0:04:36 | processing reads     | s2 |

In [2]:

from ipyrad.assemble.clustmap import *
ipyclient = ipp.Client()
s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient)
samples = list(s3.data.samples.values())
sample = samples[1]

/home/deren/miniconda3/lib/python3.6/site-packages/ipyparallel/client/client.py:458: RuntimeWarning: 
            Controller appears to be listening on localhost, but not on this machine.
            If this is true, you should specify Client(...,sshserver='you@oud')
            or instruct your controller to listen on an external IP.
  RuntimeWarning)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-58c66cc4a814> in <module>()
      1 from ipyrad.assemble.clustmap import *
      2 ipyclient = ipp.Client()
----> 3 s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient)
      4 samples = list(s3.data.samples.values())
      5 sample = samples[1]

NameError: name 'data' is not defined

In [19]:

s3.run()

[####################] 100% 0:00:00 | concatenating        | s3 |
[####################] 100% 0:00:30 | dereplicating        | s3 |
[####################] 100% 0:18:59 | clustering/mapping   | s3 |
[####################] 100% 0:00:18 | building clusters    | s3 |
[####################] 100% 0:00:01 | chunking clusters    | s3 |
[####################] 100% 0:19:32 | aligning clusters    | s3 |
[####################] 100% 0:00:39 | concat clusters      | s3 |

In [21]:

data.run("45")

Assembly: 5-cyatho
[####################] 100% 0:02:44 | inferring [H, E]     | s4 |
[####################] 100% 0:00:06 | calculating depths   | s5 |
[####################] 100% 0:00:13 | chunking clusters    | s5 |
[####################] 100% 0:11:47 | consens calling      | s5 |

In [3]:

data = ip.load_json("5-cyatho/5-cyatho.json")
samples = list(data.samples.values())
ipyclient = ipp.Client()

loading Assembly: 5-cyatho
from saved path: ~/Documents/ipyrad/tests/5-cyatho/5-cyatho.json

/home/deren/miniconda3/lib/python3.6/site-packages/ipyparallel/client/client.py:458: RuntimeWarning: 
            Controller appears to be listening on localhost, but not on this machine.
            If this is true, you should specify Client(...,sshserver='you@oud')
            or instruct your controller to listen on an external IP.
  RuntimeWarning)

In [4]:

from ipyrad.assemble.cluster_across import *
popmins = {'o': 0, 'c': 3, 'r': 3}
popdict = {
    'o': ['33588_przewalskii', '32082_przewalskii'],
    'c': ['29154_superba', '30686_cyathophylla', '41478_cyathophylloides', '41954_cyathophylloides'],
    'r': [ '35236_rex', '35855_rex', '38362_rex', '39618_rex', '40578_rex', '30556_thamno', '33413_thamno'],
}
data._link_populations(popdict, popmins)
data.populations

Out[4]:

{'o': (0, ['33588_przewalskii', '32082_przewalskii']),
 'c': (3,
  ['29154_superba',
   '30686_cyathophylla',
   '41478_cyathophylloides',
   '41954_cyathophylloides']),
 'r': (3,
  ['35236_rex',
   '35855_rex',
   '38362_rex',
   '39618_rex',
   '40578_rex',
   '30556_thamno',
   '33413_thamno'])}

In [5]:

s6 = Step6(data, samples, ipyclient, 0, 0)

In [7]:

s6.remote_build_concats_tier1()

[####################] 100% 0:00:05 | concatenating inputs | s6 |

In [8]:

s6.remote_cluster1()

[####################] 100% 0:01:43 | clustering across 1  | s6 |

In [9]:

s6.remote_build_concats_tier2()

[####################] 100% 0:00:01 | concatenating inputs | s6 |

In [10]:

s6.remote_cluster2()

[####################] 100% 0:02:27 | clustering 2 | s6 |

In [17]:

s6.remote_build_denovo_clusters()

[####################] 100% 0:00:04 | building clusters    | s6 |

In [13]:

s6.remote_align_denovo_clusters()

[####################] 100% 0:00:11 | aligning clusters    | s6 |

In [30]:

chunk = "./5-cyatho/5-cyatho_across/5-cyatho-tmpalign/5-cyatho.chunk_11760"
align_to_array(s6.data, s6.samples, chunk)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-30-6668ee2335d1> in <module>()
      1 chunk = "./5-cyatho/5-cyatho_across/5-cyatho-tmpalign/5-cyatho.chunk_11760"
----> 2 align_to_array(s6.data, s6.samples, chunk)

<ipython-input-29-a103e0eab7ba> in align_to_array(data, samples, chunk)
     86             varmat = refbuild(seqarr.view('u1'), PSEUDO_REF)
     87             constmp = varmat[:, 0].view('S1')
---> 88             consens[ldx, :constmp.size] = constmp
     89             varstmp = np.nonzero(varmat[:, 1])[0]
     90             varpos[ldx, :varstmp.size] = varstmp

ValueError: could not broadcast input array from shape (104) into shape (103)

In [29]:

def align_to_array(data, samples, chunk):

    # data are already chunked, read in the whole thing
    with open(chunk, 'rt') as infile:
        clusts = infile.read().split("//\n//\n")[:-1]    

    # snames to ensure sorted order
    samples.sort(key=lambda x: x.name)
    snames = [sample.name for sample in samples]

    # make a tmparr to store metadata (this can get huge, consider using h5)
    maxvar = 25
    maxlen = data._hackersonly["max_fragment_length"] + 20
    maxinds = data.paramsdict["max_Indels_locus"]
    ifilter = np.zeros(len(clusts), dtype=np.bool_)
    dfilter = np.zeros(len(clusts), dtype=np.bool_)
    varpos = np.zeros((len(clusts), maxvar), dtype='u1')
    indels = np.zeros((len(clusts), len(samples), sum(maxinds)), dtype='u1')
    edges = np.zeros((len(clusts), len(samples), 4), dtype='u1')
    consens = np.zeros((len(clusts), maxlen), dtype="S1")

    # create a persistent shell for running muscle in. 
    proc = sps.Popen(["bash"], stdin=sps.PIPE, stdout=sps.PIPE, bufsize=0)

    # iterate over clusters until finished
    allstack = []
    for ldx in range(len(clusts)):
        istack = []
        lines = clusts[ldx].strip().split("\n")
        names = lines[::2]
        seqs = lines[1::2]
        seqs = [i[:90] for i in seqs]    
        align1 = []
        
        # find duplicates and skip aligning but keep it for downstream.
        unames = set([i.rsplit("_", 1)[0] for i in names])
        if len(unames) < len(names):
            dfilter[ldx] = True
            istack = ["{}\n{}".format(i[1:], j) for i, j in zip(names, seqs)]

        else:
            # append counter to names because muscle doesn't retain order
            nnames = [">{};*{}".format(j[1:], i) for i, j in enumerate(names)]

            # make back into strings
            cl1 = "\n".join(["\n".join(i) for i in zip(nnames, seqs)])                

            # store allele (lowercase) info
            amask, abool = store_alleles(seqs)

            # send align1 to the bash shell (TODO: check for pipe-overflow)
            cmd1 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                    .format(cl1, ipyrad.bins.muscle, "//\n"))
            proc.stdin.write(cmd1.encode())

            # read the stdout by line until splitter is reached
            for line in iter(proc.stdout.readline, b'//\n'):
                align1.append(line.decode())

            # reorder b/c muscle doesn't keep order
            lines = "".join(align1)[1:].split("\n>")
            dalign1 = dict([i.split("\n", 1) for i in lines])
            keys = sorted(
                dalign1.keys(), 
                key=lambda x: int(x.rsplit("*")[-1])
            )
            seqarr = np.zeros(
                (len(nnames), len(dalign1[keys[0]].replace("\n", ""))),
                dtype='S1',
                )
            for kidx, key in enumerate(keys):
                concatseq = dalign1[key].replace("\n", "")
                seqarr[kidx] = list(concatseq)
                    
                # fill in edges for each seq
                edg = (
                    len(concatseq) - len(concatseq.lstrip('-')),
                    len(concatseq.rstrip('-')),
                    len(concatseq.rstrip('-')),
                    len(concatseq.rstrip('-')),
                    )
                sidx = snames.index(key.rsplit('_', 1)[0])
                edges[ldx, sidx] = edg

            # get ref/variant counts in an array
            varmat = refbuild(seqarr.view('u1'), PSEUDO_REF)
            constmp = varmat[:, 0].view('S1')
            consens[ldx, :constmp.size] = constmp
            varstmp = np.nonzero(varmat[:, 1])[0]
            varpos[ldx, :varstmp.size] = varstmp 

            # impute allele information back into the read b/c muscle
            wkeys = np.argsort([i.rsplit("_", 1)[0] for i in keys])

            # sort in sname (alphanumeric) order for indels storage
            for idx in wkeys:
                # seqarr and amask are in input order here
                args = (seqarr, idx, amask)
                seqarr[idx], indidx = retrieve_indels_and_alleles(*args)
                sidx = snames.index(names[idx].rsplit("_", 1)[0][1:])
                if indidx.size:
                    indsize = min(indidx.size, sum(maxinds))
                    indels[ldx, sidx, :indsize] = indidx[:indsize]
                wname = names[idx]

                # store (only for visually checking alignments)
                istack.append(
                    "{}\n{}".format(wname, b"".join(seqarr[idx]).decode()))

            # indel filter
            if indels.sum(axis=1).max() >= sum(maxinds):
                ifilter[ldx] = True

        # store the stack (only for visually checking alignments)
        if istack:
            allstack.append("\n".join(istack))

    # cleanup
    proc.stdout.close()
    if proc.stderr:
        proc.stderr.close()
    proc.stdin.close()
    proc.wait()

    # write to file after (only for visually checking alignments)
    odx = chunk.rsplit("_")[-1]
    alignfile = os.path.join(data.tmpdir, "aligned_{}.fa".format(odx))
    with open(alignfile, 'wt') as outfile:
        outfile.write("\n//\n//\n".join(allstack) + "\n")
    #os.remove(chunk)

    # save indels array to tmp dir
    np.save(
        os.path.join(data.tmpdir, "ifilter_{}.tmp.npy".format(odx)),
        ifilter)
    np.save(
        os.path.join(data.tmpdir, "dfilter_{}.tmp.npy".format(odx)),
        dfilter)

In [13]:

data = ip.load_json("4-refpairtest.json")
samples = list(data.samples.values())
sample = samples[0]
force = True
ipyclient = ipp.Client()

loading Assembly: 4-refpairtest
from saved path: ~/Documents/ipyrad/tests/4-refpairtest.json

/home/deren/miniconda3/lib/python3.6/site-packages/ipyparallel/client/client.py:458: RuntimeWarning: 
            Controller appears to be listening on localhost, but not on this machine.
            If this is true, you should specify Client(...,sshserver='you@oud')
            or instruct your controller to listen on an external IP.
  RuntimeWarning)

In [3]:

data.samples

Out[3]:

{'3J_0': <ipyrad.core.sample.Sample at 0x7f498c9ff208>,
 '1D_0': <ipyrad.core.sample.Sample at 0x7f498ca03780>,
 '2H_0': <ipyrad.core.sample.Sample at 0x7f498ca03940>,
 '1B_0': <ipyrad.core.sample.Sample at 0x7f498ca03e48>,
 '2F_0': <ipyrad.core.sample.Sample at 0x7f498ca03dd8>,
 '3L_0': <ipyrad.core.sample.Sample at 0x7f498c9ad898>,
 '2E_0': <ipyrad.core.sample.Sample at 0x7f498ca03e80>,
 '3I_0': <ipyrad.core.sample.Sample at 0x7f498c9ad438>,
 '3K_0': <ipyrad.core.sample.Sample at 0x7f498c9ad390>,
 '2G_0': <ipyrad.core.sample.Sample at 0x7f498c9b87f0>,
 '1A_0': <ipyrad.core.sample.Sample at 0x7f498c9b8cf8>,
 '1C_0': <ipyrad.core.sample.Sample at 0x7f498c9b8c88>}

In [4]:

popmins = {'1': 3, '2': 3, '3': 3}
popdict = {
    '1': ['1A_0', '1B_0', '1C_0', '1D_0'], 
    '2': ['2E_0', '2F_0', '2G_0', '2H_0'],
    '3': ['3I_0', '3J_0', '3K_0', '3L_0'],
}

data._link_populations(popdict, popmins)
data.populations

Out[4]:

{'1': (3, ['1A_0', '1B_0', '1C_0', '1D_0']),
 '2': (3, ['2E_0', '2F_0', '2G_0', '2H_0']),
 '3': (3, ['3I_0', '3J_0', '3K_0', '3L_0'])}

In [5]:

s = Step6(data, samples, ipyclient, 123, True)
s.samples

Out[5]:

[<ipyrad.core.sample.Sample at 0x7f498c9ff208>,
 <ipyrad.core.sample.Sample at 0x7f498ca03780>,
 <ipyrad.core.sample.Sample at 0x7f498ca03940>,
 <ipyrad.core.sample.Sample at 0x7f498ca03e48>,
 <ipyrad.core.sample.Sample at 0x7f498ca03dd8>,
 <ipyrad.core.sample.Sample at 0x7f498c9ad898>,
 <ipyrad.core.sample.Sample at 0x7f498ca03e80>,
 <ipyrad.core.sample.Sample at 0x7f498c9ad438>,
 <ipyrad.core.sample.Sample at 0x7f498c9ad390>,
 <ipyrad.core.sample.Sample at 0x7f498c9b87f0>,
 <ipyrad.core.sample.Sample at 0x7f498c9b8cf8>,
 <ipyrad.core.sample.Sample at 0x7f498c9b8c88>]

In [6]:

s.remote_build_concats_tier1()

[####################] 100% 0:00:00 | concatenating inputs | s6 |

In [7]:

s.remote_cluster1()

[####################] 100% 0:00:02 | clustering across 1  | s6 |

In [8]:

s.remote_build_concats_tier2()

[####################] 100% 0:00:00 | concatenating inputs | s6 |

In [9]:

s.remote_cluster2()

[####################] 100% 0:00:00 | clustering 2 | s6 |

In [10]:

s.remote_build_denovo_clusters()

[####################] 100% 0:00:00 | building clusters    | s6 |

In [11]:

chunks = glob.glob('4-refpairtest_across/4-refpairtest-tmpalign/*.chunk*')
for chunk in chunks:
    align_to_array(s.data, s.samples, chunk)

In [317]:

chunk = "4-refpairtest_across/4-refpairtest-tmpalign/4-refpairtest.chunk_70"
samples = s.samples
data = s.data

# data are already chunked, read in the whole thing
#with open(chunk, 'rt') as infile:
#    clusts = infile.read().split("//\n//\n")[:-1]    

In [389]:

nnames

Out[389]:

['>1A_0_7;*0',
 '>1B_0_7;*1',
 '>1D_0_7;*2',
 '>1C_0_7;*3',
 '>2E_0_7;*4',
 '>2F_0_7;*5',
 '>2H_0_7;*6',
 '>2G_0_7;*7',
 '>3I_0_7;*8',
 '>3J_0_7;*9',
 '>3L_0_7;*10',
 '>3K_0_7;*11']

In [451]:

def align_to_array(data, samples, chunk):

    # data are already chunked, read in the whole thing
    with open(chunk, 'rt') as infile:
        clusts = infile.read().split("//\n//\n")[:-1]    

    # snames to ensure sorted order
    samples.sort(key=lambda x: x.name)
    snames = [sample.name for sample in samples]

    # make a tmparr to store metadata (this can get huge, consider using h5)
    maxvar = 25
    maxlen = data._hackersonly["max_fragment_length"] + 20
    maxinds = data.paramsdict["max_Indels_locus"]
    ifilter = np.zeros(len(clusts), dtype=np.bool_)
    dfilter = np.zeros(len(clusts), dtype=np.bool_)
    varpos = np.zeros((len(clusts), maxvar), dtype='u1')
    indels = np.zeros((len(clusts), len(samples), sum(maxinds)), dtype='u1')
    edges = np.zeros((len(clusts), len(samples), 4), dtype='u1')
    consens = np.zeros((len(clusts), maxlen), dtype="S1")

    # create a persistent shell for running muscle in. 
    proc = sps.Popen(["bash"], stdin=sps.PIPE, stdout=sps.PIPE, bufsize=0)

    # iterate over clusters until finished
    allstack = []
    for ldx in range(len(clusts)):
        istack = []
        lines = clusts[ldx].strip().split("\n")
        names = lines[::2]
        seqs = lines[1::2]        
        align1 = []
        
        # find duplicates and skip aligning/ordering since it will be filtered.
        unames = set([i.rsplit("_", 1)[0] for i in names])
        if len(unames) < len(names):
            dfilter[ldx] = True
            istack = ["{}\n{}".format(i[1:], j) for i, j in zip(names, seqs)]

        else:
            # append counter to names because muscle doesn't retain order
            nnames = [">{};*{}".format(j[1:], i) for i, j in enumerate(names)]

            # make back into strings
            cl1 = "\n".join(["\n".join(i) for i in zip(nnames, seqs)])                

            # store allele (lowercase) info on seqs
            amask, abool = store_alleles(seqs)

            # send align1 to the bash shell (TODO: check for pipe-overflow)
            cmd1 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                    .format(cl1, ipyrad.bins.muscle, "//\n"))
            proc.stdin.write(cmd1.encode())

            # read the stdout by line until splitter is reached
            for line in iter(proc.stdout.readline, b'//\n'):
                align1.append(line.decode())

            # reorder into original order and arrange into a dictionary and arr
            lines = "".join(align1)[1:].split("\n>")
            dalign1 = dict([i.split("\n", 1) for i in lines])
            keys = sorted(
                dalign1.keys(), 
                key=lambda x: int(x.rsplit("*")[-1])
            )
            seqarr = np.zeros(
                (len(nnames), len(dalign1[keys[0]].replace("\n", ""))),
                dtype='S1',
                )
            for kidx, key in enumerate(keys):
                concatseq = dalign1[key].replace("\n", "")
                seqarr[kidx] = list(concatseq)
                    
                # fill in edges for each seq
                edg = (
                    len(concatseq) - len(concatseq.lstrip('-')),
                    len(concatseq.rstrip('-')),
                    len(concatseq.rstrip('-')),
                    len(concatseq.rstrip('-')),
                    )
                sidx = snames.index(key.rsplit('_', 1)[0])
                edges[ldx, sidx] = edg

            # get ref/variant counts in an array
            varmat = refbuild(seqarr.view('u1'), PSEUDO_REF)
            constmp = varmat[:, 0].view("S1")
            consens[ldx, :constmp.size] = constmp
            varstmp = np.nonzero(varmat[:, 1])[0]
            varpos[ldx, :varstmp.size] = varstmp 

            # impute allele information back into the read b/c muscle
            wkeys = np.argsort([i.rsplit("_", 1)[0] for i in keys])

            # sort in sname (alphanumeric) order for indels storage
            for idx in wkeys:
                # seqarr and amask are in input order here
                args = (seqarr, idx, amask)
                seqarr[idx], indidx = retrieve_indels_and_alleles(*args)
                sidx = snames.index(names[idx].rsplit("_", 1)[0][1:])
                if indidx.size:
                    indels[sidx, :indidx.size] = indidx
                wname = names[idx]
                istack.append(
                    "{}\n{}".format(wname, b"".join(seqarr[idx]).decode()))

        # store the stack
        if istack:
            allstack.append("\n".join(istack))

    # cleanup
    proc.stdout.close()
    if proc.stderr:
        proc.stderr.close()
    proc.stdin.close()
    proc.wait()

    # write to file after
    odx = chunk.rsplit("_")[-1]
    alignfile = os.path.join(data.tmpdir, "align_{}.fa".format(odx))
    with open(alignfile, 'wt') as outfile:
        outfile.write("\n//\n//\n".join(allstack) + "\n")
    #os.remove(chunk)

    # save indels array to tmp dir
    ifile = os.path.join(data.tmpdir, "indels_{}.tmp.npy".format(odx))
    np.save(ifile, ifilter)
    dfile = os.path.join(data.tmpdir, "duples_{}.tmp.npy".format(odx))
    np.save(dfile, dfilter)

In [456]:

def store_alleles(seqs):
    shape = (len(seqs), max([len(i) for i in seqs]))
    arrseqs = np.zeros(shape, dtype=np.bytes_)
    for row in range(arrseqs.shape[0]):
        seqsrow = seqs[row]
        arrseqs[row, :len(seqsrow)] = list(seqsrow)
    amask = np.char.islower(arrseqs)
    if np.any(amask):
        return amask, True
    else:
        return amask, False



def retrieve_indels_and_alleles(seqarr, idx, amask):
    concatarr = seqarr[idx]
    newmask = np.zeros(len(concatarr), dtype=np.bool_)                        

    # check for indels and impute to amask
    indidx = np.where(concatarr == b"-")[0]
    if np.sum(amask):
        print('yes')
        if indidx.size:

            # impute for position of variants
            allrows = np.arange(amask.shape[1])
            mask = np.ones(allrows.shape[0], dtype=np.bool_)
            for idx in indidx:
                if idx < mask.shape[0]:
                    mask[idx] = False
            not_idx = allrows[mask == 1]

            # fill in new data into all other spots
            newmask[not_idx] = amask[idx, :not_idx.shape[0]]

        else:
            newmask = amask[idx]
                
        # lower the alleles
        concatarr[newmask] = np.char.lower(concatarr[newmask])

    return concatarr, indidx

In [462]:

np.load("4-refpairtest_across/4-refpairtest-tmpalign/indels_35.tmp.npy")

Out[462]:

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False])

In [457]:

align_to_array(data, samples, chunk)

In [459]:

chunks = glob.glob("4-refpairtest_across/4-refpairtest-tmpalign/4-refpairtest.chunk_*")
for chunk in chunks:
    align_to_array(data, samples, chunk)

In [263]:

@numba.jit(nopython=True)
def refbuild(iseq, consdict):
    """ Returns the most common base at each site in order. """

    altrefs = np.zeros((iseq.shape[1], 4), dtype=np.uint8)

    for col in range(iseq.shape[1]):
        ## expand colums with ambigs and remove N-
        fcounts = np.zeros(111, dtype=np.int64)
        counts = np.bincount(iseq[:, col])#, minlength=90)
        fcounts[:counts.shape[0]] = counts
        ## set N and - to zero, wish numba supported minlen arg
        fcounts[78] = 0
        fcounts[45] = 0
        ## add ambig counts to true bases
        for aidx in range(consdict.shape[0]):
            nbases = fcounts[consdict[aidx, 0]]
            for _ in range(nbases):
                fcounts[consdict[aidx, 1]] += 1
                fcounts[consdict[aidx, 2]] += 1
            fcounts[consdict[aidx, 0]] = 0

        ## now get counts from the modified counts arr
        who = np.argmax(fcounts)
        altrefs[col, 0] = who
        fcounts[who] = 0

        ## if an alt allele fill over the "." placeholder
        who = np.argmax(fcounts)
        if who:
            altrefs[col, 1] = who
            fcounts[who] = 0

            ## if 3rd or 4th alleles observed then add to arr
            who = np.argmax(fcounts)
            altrefs[col, 2] = who
            fcounts[who] = 0

            ## if 3rd or 4th alleles observed then add to arr
            who = np.argmax(fcounts)
            altrefs[col, 3] = who

    return altrefs


PSEUDO_REF = np.array([
    [82, 71, 65],
    [75, 71, 84],
    [83, 71, 67],
    [89, 84, 67],
    [87, 84, 65],
    [77, 67, 65],
    [78, 78, 78],
    [45, 45, 45],
    ], dtype=np.uint8)

In [274]:

seqarr.view("u1")

Out[274]:

array([[84, 71, 67, ..., 67, 67, 71],
       [84, 71, 67, ..., 67, 67, 71],
       [84, 71, 67, ..., 67, 67, 71],
       ...,
       [84, 71, 67, ..., 67, 67, 71],
       [84, 71, 67, ..., 67, 67, 71],
       [84, 71, 67, ..., 67, 67, 71]], dtype=uint8)

In [132]:

aseqs = np.array([
    list('ATG--GGA'),
    list('A--GGGGA'),
    list('---GGGGA'),
    list('--------'),
])

np.where(aseqs == "-")

Out[132]:

(array([0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3]),
 array([3, 4, 1, 2, 0, 1, 2, 0, 1, 2, 3, 4, 5, 6, 7]))

In [156]:

PSEUDO_REF = np.array([
    [82, 71, 65],
    [75, 71, 84],
    [83, 71, 67],
    [89, 84, 67],
    [87, 84, 65],
    [77, 67, 65],
    [78, 78, 78],
    [45, 45, 45],
    ], dtype=np.uint8)

In [164]:

@numba.jit(nopython=True)
def reftrick(iseq, consdict):
    """ Returns the most common base at each site in order. """

    altrefs = np.zeros((iseq.shape[1], 4), dtype=np.uint8)
    #altrefs[:, 1] = 46

    for col in range(iseq.shape[1]):
        ## expand colums with ambigs and remove N-
        fcounts = np.zeros(111, dtype=np.int64)
        counts = np.bincount(iseq[:, col])#, minlength=90)
        fcounts[:counts.shape[0]] = counts
        ## set N and - to zero, wish numba supported minlen arg
        fcounts[78] = 0
        fcounts[45] = 0
        ## add ambig counts to true bases
        for aidx in range(consdict.shape[0]):
            nbases = fcounts[consdict[aidx, 0]]
            for _ in range(nbases):
                fcounts[consdict[aidx, 1]] += 1
                fcounts[consdict[aidx, 2]] += 1
            fcounts[consdict[aidx, 0]] = 0

        ## now get counts from the modified counts arr
        who = np.argmax(fcounts)
        altrefs[col, 0] = who
        fcounts[who] = 0

        ## if an alt allele fill over the "." placeholder
        who = np.argmax(fcounts)
        if who:
            altrefs[col, 1] = who
            fcounts[who] = 0

            ## if 3rd or 4th alleles observed then add to arr
            who = np.argmax(fcounts)
            altrefs[col, 2] = who
            fcounts[who] = 0

            ## if 3rd or 4th alleles observed then add to arr
            who = np.argmax(fcounts)
            altrefs[col, 3] = who

    return altrefs

In [195]:

sss = np.array([
    list("---AAAA----"),
    list("---TTAA----"),
    list("AAAAAAAAAAA"),
    list("AATAAAATAAA"),
], dtype="S1")
sss

Out[195]:

array([[b'-', b'-', b'-', b'A', b'A', b'A', b'A', b'-', b'-', b'-', b'-'],
       [b'-', b'-', b'-', b'T', b'T', b'A', b'A', b'-', b'-', b'-', b'-'],
       [b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A'],
       [b'A', b'A', b'T', b'A', b'A', b'A', b'A', b'T', b'A', b'A', b'A']],
      dtype='|S1')

In [196]:

varmat = reftrick(sss.view(np.uint8), PSEUDO_REF)
varmat

Out[196]:

array([[65,  0,  0,  0],
       [65,  0,  0,  0],
       [65, 84,  0,  0],
       [65, 84,  0,  0],
       [65, 84,  0,  0],
       [65,  0,  0,  0],
       [65,  0,  0,  0],
       [65, 84,  0,  0],
       [65,  0,  0,  0],
       [65,  0,  0,  0],
       [65,  0,  0,  0]], dtype=uint8)

In [226]:

names

Out[226]:

['>1A_0_7',
 '>1B_0_7',
 '>1D_0_7',
 '>1C_0_7',
 '>2E_0_7',
 '>2F_0_7',
 '>2H_0_7',
 '>2G_0_7',
 '>3I_0_7',
 '>3J_0_7',
 '>3L_0_7',
 '>3K_0_7']

In [202]:

%%timeit
store_alleles(seqs)

31.3 µs ± 764 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

In [219]:

[b"".join(i) for i in arrseqs ]

Out[219]:

[b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',
 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',
 b'TGCAGCTTCTGAAGCCTTAAATCCTCACGTCAACATGATGCCTWCATGAA',
 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',
 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',
 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',
 b'TGCAGCTTCTGAAGCCTGCAATCCTCACGTCAATATGATGCCTACATGAA',
 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',
 b'TGCAGCTYCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',
 b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',
 b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA',
 b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA']

In [229]:

snames

Out[229]:

['1A_0',
 '1B_0',
 '1C_0',
 '1D_0',
 '2E_0',
 '2F_0',
 '2G_0',
 '2H_0',
 '3I_0',
 '3J_0',
 '3K_0',
 '3L_0']

In [237]:

widx = np.argsort([i.rsplit(";", 1)[0] for i in keys])
for i in widx:
    print(names[i], b"".join(seqarr[i]))

>1A_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'
>1B_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'
>1C_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'
>1D_0_7 b'TGCAGCTTCTGAAGCCTTAAATCCTCACGTCAACATGATGCCTWCATGAA'
>2E_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'
>2F_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'
>2G_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'
>2H_0_7 b'TGCAGCTTCTGAAGCCTGCAATCCTCACGTCAATATGATGCCTACATGAA'
>3I_0_7 b'TGCAGCTYCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'
>3J_0_7 b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'
>3K_0_7 b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'
>3L_0_7 b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'

In [184]:

consensus = varmat[:, 0]
variants = np.nonzero(varmat[:, 1])[0]
variants

Out[184]:

array([2, 3, 4, 7])

In [148]:

seqarr == seqarr[0]

Out[148]:

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
        False,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True, False,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True, False,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True, False,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True, False,  True,
         True, False,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True, False,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True, False,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True, False,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True]])

In [145]:

for kidx in range(seqarr.shape[0]):
    pass

In [ ]:

In [133]:

# reorder b/c muscle doesn't keep order
lines = "".join(align1)[1:].split("\n>")
dalign1 = dict([i.split("\n", 1) for i in lines])
keys = sorted(
    dalign1.keys(), 
    key=lambda x: int(x.rsplit("*")[-1])
)
seqarr = np.zeros(
    (len(nnames), len(dalign1[keys[0]])),
    dtype='U1',
    )
for kidx, key in enumerate(keys):
    concatseq = dalign1[key].replace("\n", '')
    seqarr[kidx] = list(dalign1[key].replace("\n", ""))

seqarr

Out[133]:

array([['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',
        'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',
        'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',
        'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],
       ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',
        'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',
        'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',
        'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],
       ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',
        'G', 'C', 'C', 'T', 'T', 'A', 'A', 'A', 'T', 'C', 'C', 'T', 'C',
        'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',
        'G', 'C', 'C', 'T', 'W', 'C', 'A', 'T', 'G', 'A', 'A'],
       ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',
        'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',
        'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',
        'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],
       ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',
        'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',
        'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',
        'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],
       ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',
        'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',
        'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',
        'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],
       ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',
        'G', 'C', 'C', 'T', 'G', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',
        'A', 'C', 'G', 'T', 'C', 'A', 'A', 'T', 'A', 'T', 'G', 'A', 'T',
        'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],
       ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A',
        'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',
        'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',
        'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],
       ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'Y', 'C', 'T', 'T', 'A', 'A',
        'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',
        'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',
        'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],
       ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'T', 'A', 'A',
        'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',
        'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',
        'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],
       ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'T', 'A', 'A',
        'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',
        'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',
        'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'],
       ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'T', 'A', 'A',
        'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C',
        'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T',
        'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A']],
      dtype='<U1')

In [101]:

# fill in edges for each seq
edg = (
    len(concatseq) - len(concatseq.lstrip('-')),
    len(concatseq.rstrip('-')),
    len(concatseq.rstrip('-')),
    len(concatseq.rstrip('-')),
    )             
edges[ldx, kidx] = edg

edges[ldx, kidx]

concatseq[0:50] + 'nnnn'[50:50] + concatseq[50:50]

Out[101]:

'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'

In [22]:

    seeds = [
        os.path.join(
            data.dirs.across, 
            "{}-{}.htemp".format(data.name, jobid)) for jobid in jobids
        ]
    allseeds = os.path.join(data.dirs.across, "{}-x.fa".format(data.name))

    cmd1 = ['cat'] + seeds
    cmd2 = [ipyrad.bins.vsearch, '--sortbylength', '-', '--output', allseeds]
    proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)
    proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=sps.PIPE, close_fds=True)
    out = proc2.communicate()
    proc1.stdout.close()

In [23]:

out

Out[23]:

(b'', None)

In [12]:

data = s.data
jobid = 2
nthreads = 4

In [14]:

    # get files for this jobid
    catshuf = os.path.join(
        data.dirs.across, 
        "{}-{}-catshuf.fa".format(data.name, jobid))
    uhaplos = os.path.join(
        data.dirs.across, 
        "{}-{}.utemp".format(data.name, jobid))
    hhaplos = os.path.join(
        data.dirs.across, 
        "{}-{}.htemp".format(data.name, jobid))
    #logfile = os.path.join(data.dirs.across, "s6_cluster_stats.txt")

    ## parameters that vary by datatype
    ## (too low of cov values yield too many poor alignments)
    strand = "plus"
    cov = 0.75         # 0.90
    if data.paramsdict["datatype"] in ["gbs", "2brad"]:
        strand = "both"
        cov = 0.60
    elif data.paramsdict["datatype"] == "pairgbs":
        strand = "both"
        cov = 0.75     # 0.90

    ## nthreads is calculated in 'call_cluster()'
    cmd = [ipyrad.bins.vsearch,
           "-cluster_smallmem", catshuf,
           "-strand", strand,
           "-query_cov", str(cov),
           "-minsl", str(0.5),
           "-id", str(data.paramsdict["clust_threshold"]),
           "-userout", uhaplos,
           "-notmatched", hhaplos,
           "-userfields", "query+target+qstrand",
           "-maxaccepts", "1",
           "-maxrejects", "0",
           "-fasta_width", "0",
           "-threads", str(nthreads),  # "0",
           "-fulldp",
           "-usersort",
           ]
    proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)
    out = proc.communicate()
    if proc.returncode:
        raise IPyradError(out)

In [20]:

proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)
out = proc.communicate()
if proc.returncode:
    raise IPyradError(out)

In [22]:

build_concat_files(s.data, 1, [s.data.samples[i] for i in s.cgroups[1]], 123)

In [23]:

data = s.data
jobid = 1
samples = [s.data.samples[i] for i in s.cgroups[1]]
randomseed = 123

In [24]:

conshandles = [
    sample.files.consens[0] for sample in samples if 
    sample.stats.reads_consens]
conshandles.sort()
assert conshandles, "no consensus files found"

In [26]:

    ## concatenate all of the gzipped consens files
    cmd = ['cat'] + conshandles
    groupcons = os.path.join(
        data.dirs.across, 
        "{}-{}-catcons.gz".format(data.name, jobid))
    LOGGER.debug(" ".join(cmd))
    with open(groupcons, 'w') as output:
        call = sps.Popen(cmd, stdout=output, close_fds=True)
        call.communicate()

    ## a string of sed substitutions for temporarily replacing hetero sites
    ## skips lines with '>', so it doesn't affect taxon names
    subs = ["/>/!s/W/A/g", "/>/!s/w/A/g", "/>/!s/R/A/g", "/>/!s/r/A/g",
            "/>/!s/M/A/g", "/>/!s/m/A/g", "/>/!s/K/T/g", "/>/!s/k/T/g",
            "/>/!s/S/C/g", "/>/!s/s/C/g", "/>/!s/Y/C/g", "/>/!s/y/C/g"]
    subs = ";".join(subs)

    ## impute pseudo-haplo information to avoid mismatch at hetero sites
    ## the read data with hetero sites is put back into clustered data later.
    ## pipe passed data from gunzip to sed.
    cmd1 = ["gunzip", "-c", groupcons]
    cmd2 = ["sed", subs]
    LOGGER.debug(" ".join(cmd1))
    LOGGER.debug(" ".join(cmd2))

    proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)
    allhaps = groupcons.replace("-catcons.gz", "-cathaps.fa")
    with open(allhaps, 'w') as output:
        proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=output, close_fds=True)
        proc2.communicate()
    proc1.stdout.close()

    ## now sort the file using vsearch
    allsort = groupcons.replace("-catcons.gz", "-catsort.fa")
    cmd1 = [ipyrad.bins.vsearch,
            "--sortbylength", allhaps,
            "--fasta_width", "0",
            "--output", allsort]
    LOGGER.debug(" ".join(cmd1))
    proc1 = sps.Popen(cmd1, close_fds=True)
    proc1.communicate()

Out[26]:

(None, None)

In [28]:

data.dirs.across

Out[28]:

''

In [8]:

# calculate the theading of cluster1 jobs:
ncpus = len(self.ipyclient.ids)
njobs = len(self.cgroups)
nnodes = len(self.hostd)
minthreads = 2
maxthreads = 10

In [9]:

# product
targets = [0, 1]
self.thview = self.ipyclient.load_balanced_view(targets=targets)

In [10]:

s.prepare_concats()

[####################] 100% 0:00:02 | concatenating inputs | s6 |

In [31]:

cluster1(data, 2, 4)

---------------------------------------------------------------------------
IPyradError                               Traceback (most recent call last)
<ipython-input-31-38c961b35fd2> in <module>()
----> 1 cluster1(data, 2, 4)

<ipython-input-30-0187c2dc7daf> in cluster1(data, jobid, nthreads)
     45     out = proc.communicate()
     46     if proc.returncode:
---> 47         raise IPyradError(out)

IPyradError: (b'vsearch v2.8.0_linux_x86_64, 15.5GB RAM, 4 cores\nhttps://github.com/torognes/vsearch\n\n\n\nUnable to open file for reading (/home/deren/Documents/ipyrad/tests/4-refpairtest_across/4-refpairtest-2-catshuf.fa)\n', None)

In [30]:

def cluster1(data, jobid, nthreads):

    # get files for this jobid
    catshuf = os.path.join(
        data.dirs.across, 
        "{}-{}-catshuf.fa".format(data.name, jobid))
    uhaplos = os.path.join(
        data.dirs.across, 
        "{}-{}.utemp".format(data.name, jobid))
    hhaplos = os.path.join(
        data.dirs.across, 
        "{}-{}.htemp".format(data.name, jobid))

    ## parameters that vary by datatype
    ## (too low of cov values yield too many poor alignments)
    strand = "plus"
    cov = 0.75         # 0.90
    if data.paramsdict["datatype"] in ["gbs", "2brad"]:
        strand = "both"
        cov = 0.60
    elif data.paramsdict["datatype"] == "pairgbs":
        strand = "both"
        cov = 0.75     # 0.90

    ## nthreads is calculated in 'call_cluster()'
    cmd = [ipyrad.bins.vsearch,
           "-cluster_smallmem", catshuf,
           "-strand", strand,
           "-query_cov", str(cov),
           "-minsl", str(0.5),
           "-id", str(data.paramsdict["clust_threshold"]),
           "-userout", uhaplos,
           "-notmatched", hhaplos,
           "-userfields", "query+target+qstrand",
           "-maxaccepts", "1",
           "-maxrejects", "0",
           "-fasta_width", "0",
           "-threads", str(nthreads),  # "0",
           "-fulldp",
           "-usersort",
           ]
    proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)
    out = proc.communicate()
    if proc.returncode:
        raise IPyradError(out)

In [12]:

if len(self.samples) < 50:
    nclusters = 1
elif len(self.samples) < 100:
    nclusters = 2
elif len(self.samples) < 200:
    nclusters = 4
elif len(self.samples) > 500:
    nclusters = 10
else:
    nclusters = int(len(self.samples) / 10)
    
    

In [ ]:

In [10]:

self.cgroups

Out[10]:

{0: ['1A_0', '1B_0', '1C_0', '1D_0'],
 1: ['2E_0', '2F_0', '2G_0', '2H_0'],
 2: ['3I_0', '3J_0', '3K_0', '3L_0']}

In [48]:

hosts

Out[48]:

{'oud': [0, 1, 2, 3]}

In [ ]: