from ipyrad.assemble.clustmap import *
from ipyrad.assemble.cluster_across import *
import ipyparallel as ipp
data = ip.Assembly("5-cyatho")
data.set_params("project_dir", "5-cyatho")
data.set_params("sorted_fastq_path", "./example_empirical_rad/*.gz")
data.set_params("filter_adapters", 2)
data.set_params("clust_threshold", .90)
New Assembly: 5-cyatho
data.run("12")
Assembly: 5-cyatho [####################] 100% 0:00:12 | loading reads | s1 | [####################] 100% 0:04:36 | processing reads | s2 |
from ipyrad.assemble.clustmap import *
ipyclient = ipp.Client()
s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient)
samples = list(s3.data.samples.values())
sample = samples[1]
/home/deren/miniconda3/lib/python3.6/site-packages/ipyparallel/client/client.py:458: RuntimeWarning: Controller appears to be listening on localhost, but not on this machine. If this is true, you should specify Client(...,sshserver='you@oud') or instruct your controller to listen on an external IP. RuntimeWarning)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-2-58c66cc4a814> in <module>() 1 from ipyrad.assemble.clustmap import * 2 ipyclient = ipp.Client() ----> 3 s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient) 4 samples = list(s3.data.samples.values()) 5 sample = samples[1] NameError: name 'data' is not defined
s3.run()
[####################] 100% 0:00:00 | concatenating | s3 | [####################] 100% 0:00:30 | dereplicating | s3 | [####################] 100% 0:18:59 | clustering/mapping | s3 | [####################] 100% 0:00:18 | building clusters | s3 | [####################] 100% 0:00:01 | chunking clusters | s3 | [####################] 100% 0:19:32 | aligning clusters | s3 | [####################] 100% 0:00:39 | concat clusters | s3 |
data.run("45")
Assembly: 5-cyatho [####################] 100% 0:02:44 | inferring [H, E] | s4 | [####################] 100% 0:00:06 | calculating depths | s5 | [####################] 100% 0:00:13 | chunking clusters | s5 | [####################] 100% 0:11:47 | consens calling | s5 |
data = ip.load_json("5-cyatho/5-cyatho.json")
samples = list(data.samples.values())
ipyclient = ipp.Client()
loading Assembly: 5-cyatho from saved path: ~/Documents/ipyrad/tests/5-cyatho/5-cyatho.json
/home/deren/miniconda3/lib/python3.6/site-packages/ipyparallel/client/client.py:458: RuntimeWarning: Controller appears to be listening on localhost, but not on this machine. If this is true, you should specify Client(...,sshserver='you@oud') or instruct your controller to listen on an external IP. RuntimeWarning)
from ipyrad.assemble.cluster_across import *
popmins = {'o': 0, 'c': 3, 'r': 3}
popdict = {
'o': ['33588_przewalskii', '32082_przewalskii'],
'c': ['29154_superba', '30686_cyathophylla', '41478_cyathophylloides', '41954_cyathophylloides'],
'r': [ '35236_rex', '35855_rex', '38362_rex', '39618_rex', '40578_rex', '30556_thamno', '33413_thamno'],
}
data._link_populations(popdict, popmins)
data.populations
{'o': (0, ['33588_przewalskii', '32082_przewalskii']), 'c': (3, ['29154_superba', '30686_cyathophylla', '41478_cyathophylloides', '41954_cyathophylloides']), 'r': (3, ['35236_rex', '35855_rex', '38362_rex', '39618_rex', '40578_rex', '30556_thamno', '33413_thamno'])}
s6 = Step6(data, samples, ipyclient, 0, 0)
s6.remote_build_concats_tier1()
[####################] 100% 0:00:05 | concatenating inputs | s6 |
s6.remote_cluster1()
[####################] 100% 0:01:43 | clustering across 1 | s6 |
s6.remote_build_concats_tier2()
[####################] 100% 0:00:01 | concatenating inputs | s6 |
s6.remote_cluster2()
[####################] 100% 0:02:27 | clustering 2 | s6 |
s6.remote_build_denovo_clusters()
[####################] 100% 0:00:04 | building clusters | s6 |
s6.remote_align_denovo_clusters()
[####################] 100% 0:00:11 | aligning clusters | s6 |
chunk = "./5-cyatho/5-cyatho_across/5-cyatho-tmpalign/5-cyatho.chunk_11760"
align_to_array(s6.data, s6.samples, chunk)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-30-6668ee2335d1> in <module>() 1 chunk = "./5-cyatho/5-cyatho_across/5-cyatho-tmpalign/5-cyatho.chunk_11760" ----> 2 align_to_array(s6.data, s6.samples, chunk) <ipython-input-29-a103e0eab7ba> in align_to_array(data, samples, chunk) 86 varmat = refbuild(seqarr.view('u1'), PSEUDO_REF) 87 constmp = varmat[:, 0].view('S1') ---> 88 consens[ldx, :constmp.size] = constmp 89 varstmp = np.nonzero(varmat[:, 1])[0] 90 varpos[ldx, :varstmp.size] = varstmp ValueError: could not broadcast input array from shape (104) into shape (103)
def align_to_array(data, samples, chunk):
# data are already chunked, read in the whole thing
with open(chunk, 'rt') as infile:
clusts = infile.read().split("//\n//\n")[:-1]
# snames to ensure sorted order
samples.sort(key=lambda x: x.name)
snames = [sample.name for sample in samples]
# make a tmparr to store metadata (this can get huge, consider using h5)
maxvar = 25
maxlen = data._hackersonly["max_fragment_length"] + 20
maxinds = data.paramsdict["max_Indels_locus"]
ifilter = np.zeros(len(clusts), dtype=np.bool_)
dfilter = np.zeros(len(clusts), dtype=np.bool_)
varpos = np.zeros((len(clusts), maxvar), dtype='u1')
indels = np.zeros((len(clusts), len(samples), sum(maxinds)), dtype='u1')
edges = np.zeros((len(clusts), len(samples), 4), dtype='u1')
consens = np.zeros((len(clusts), maxlen), dtype="S1")
# create a persistent shell for running muscle in.
proc = sps.Popen(["bash"], stdin=sps.PIPE, stdout=sps.PIPE, bufsize=0)
# iterate over clusters until finished
allstack = []
for ldx in range(len(clusts)):
istack = []
lines = clusts[ldx].strip().split("\n")
names = lines[::2]
seqs = lines[1::2]
seqs = [i[:90] for i in seqs]
align1 = []
# find duplicates and skip aligning but keep it for downstream.
unames = set([i.rsplit("_", 1)[0] for i in names])
if len(unames) < len(names):
dfilter[ldx] = True
istack = ["{}\n{}".format(i[1:], j) for i, j in zip(names, seqs)]
else:
# append counter to names because muscle doesn't retain order
nnames = [">{};*{}".format(j[1:], i) for i, j in enumerate(names)]
# make back into strings
cl1 = "\n".join(["\n".join(i) for i in zip(nnames, seqs)])
# store allele (lowercase) info
amask, abool = store_alleles(seqs)
# send align1 to the bash shell (TODO: check for pipe-overflow)
cmd1 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
.format(cl1, ipyrad.bins.muscle, "//\n"))
proc.stdin.write(cmd1.encode())
# read the stdout by line until splitter is reached
for line in iter(proc.stdout.readline, b'//\n'):
align1.append(line.decode())
# reorder b/c muscle doesn't keep order
lines = "".join(align1)[1:].split("\n>")
dalign1 = dict([i.split("\n", 1) for i in lines])
keys = sorted(
dalign1.keys(),
key=lambda x: int(x.rsplit("*")[-1])
)
seqarr = np.zeros(
(len(nnames), len(dalign1[keys[0]].replace("\n", ""))),
dtype='S1',
)
for kidx, key in enumerate(keys):
concatseq = dalign1[key].replace("\n", "")
seqarr[kidx] = list(concatseq)
# fill in edges for each seq
edg = (
len(concatseq) - len(concatseq.lstrip('-')),
len(concatseq.rstrip('-')),
len(concatseq.rstrip('-')),
len(concatseq.rstrip('-')),
)
sidx = snames.index(key.rsplit('_', 1)[0])
edges[ldx, sidx] = edg
# get ref/variant counts in an array
varmat = refbuild(seqarr.view('u1'), PSEUDO_REF)
constmp = varmat[:, 0].view('S1')
consens[ldx, :constmp.size] = constmp
varstmp = np.nonzero(varmat[:, 1])[0]
varpos[ldx, :varstmp.size] = varstmp
# impute allele information back into the read b/c muscle
wkeys = np.argsort([i.rsplit("_", 1)[0] for i in keys])
# sort in sname (alphanumeric) order for indels storage
for idx in wkeys:
# seqarr and amask are in input order here
args = (seqarr, idx, amask)
seqarr[idx], indidx = retrieve_indels_and_alleles(*args)
sidx = snames.index(names[idx].rsplit("_", 1)[0][1:])
if indidx.size:
indsize = min(indidx.size, sum(maxinds))
indels[ldx, sidx, :indsize] = indidx[:indsize]
wname = names[idx]
# store (only for visually checking alignments)
istack.append(
"{}\n{}".format(wname, b"".join(seqarr[idx]).decode()))
# indel filter
if indels.sum(axis=1).max() >= sum(maxinds):
ifilter[ldx] = True
# store the stack (only for visually checking alignments)
if istack:
allstack.append("\n".join(istack))
# cleanup
proc.stdout.close()
if proc.stderr:
proc.stderr.close()
proc.stdin.close()
proc.wait()
# write to file after (only for visually checking alignments)
odx = chunk.rsplit("_")[-1]
alignfile = os.path.join(data.tmpdir, "aligned_{}.fa".format(odx))
with open(alignfile, 'wt') as outfile:
outfile.write("\n//\n//\n".join(allstack) + "\n")
#os.remove(chunk)
# save indels array to tmp dir
np.save(
os.path.join(data.tmpdir, "ifilter_{}.tmp.npy".format(odx)),
ifilter)
np.save(
os.path.join(data.tmpdir, "dfilter_{}.tmp.npy".format(odx)),
dfilter)
data = ip.load_json("4-refpairtest.json")
samples = list(data.samples.values())
sample = samples[0]
force = True
ipyclient = ipp.Client()
loading Assembly: 4-refpairtest from saved path: ~/Documents/ipyrad/tests/4-refpairtest.json
/home/deren/miniconda3/lib/python3.6/site-packages/ipyparallel/client/client.py:458: RuntimeWarning: Controller appears to be listening on localhost, but not on this machine. If this is true, you should specify Client(...,sshserver='you@oud') or instruct your controller to listen on an external IP. RuntimeWarning)
data.samples
{'3J_0': <ipyrad.core.sample.Sample at 0x7f498c9ff208>, '1D_0': <ipyrad.core.sample.Sample at 0x7f498ca03780>, '2H_0': <ipyrad.core.sample.Sample at 0x7f498ca03940>, '1B_0': <ipyrad.core.sample.Sample at 0x7f498ca03e48>, '2F_0': <ipyrad.core.sample.Sample at 0x7f498ca03dd8>, '3L_0': <ipyrad.core.sample.Sample at 0x7f498c9ad898>, '2E_0': <ipyrad.core.sample.Sample at 0x7f498ca03e80>, '3I_0': <ipyrad.core.sample.Sample at 0x7f498c9ad438>, '3K_0': <ipyrad.core.sample.Sample at 0x7f498c9ad390>, '2G_0': <ipyrad.core.sample.Sample at 0x7f498c9b87f0>, '1A_0': <ipyrad.core.sample.Sample at 0x7f498c9b8cf8>, '1C_0': <ipyrad.core.sample.Sample at 0x7f498c9b8c88>}
popmins = {'1': 3, '2': 3, '3': 3}
popdict = {
'1': ['1A_0', '1B_0', '1C_0', '1D_0'],
'2': ['2E_0', '2F_0', '2G_0', '2H_0'],
'3': ['3I_0', '3J_0', '3K_0', '3L_0'],
}
data._link_populations(popdict, popmins)
data.populations
{'1': (3, ['1A_0', '1B_0', '1C_0', '1D_0']), '2': (3, ['2E_0', '2F_0', '2G_0', '2H_0']), '3': (3, ['3I_0', '3J_0', '3K_0', '3L_0'])}
s = Step6(data, samples, ipyclient, 123, True)
s.samples
[<ipyrad.core.sample.Sample at 0x7f498c9ff208>, <ipyrad.core.sample.Sample at 0x7f498ca03780>, <ipyrad.core.sample.Sample at 0x7f498ca03940>, <ipyrad.core.sample.Sample at 0x7f498ca03e48>, <ipyrad.core.sample.Sample at 0x7f498ca03dd8>, <ipyrad.core.sample.Sample at 0x7f498c9ad898>, <ipyrad.core.sample.Sample at 0x7f498ca03e80>, <ipyrad.core.sample.Sample at 0x7f498c9ad438>, <ipyrad.core.sample.Sample at 0x7f498c9ad390>, <ipyrad.core.sample.Sample at 0x7f498c9b87f0>, <ipyrad.core.sample.Sample at 0x7f498c9b8cf8>, <ipyrad.core.sample.Sample at 0x7f498c9b8c88>]
s.remote_build_concats_tier1()
[####################] 100% 0:00:00 | concatenating inputs | s6 |
s.remote_cluster1()
[####################] 100% 0:00:02 | clustering across 1 | s6 |
s.remote_build_concats_tier2()
[####################] 100% 0:00:00 | concatenating inputs | s6 |
s.remote_cluster2()
[####################] 100% 0:00:00 | clustering 2 | s6 |
s.remote_build_denovo_clusters()
[####################] 100% 0:00:00 | building clusters | s6 |
chunks = glob.glob('4-refpairtest_across/4-refpairtest-tmpalign/*.chunk*')
for chunk in chunks:
align_to_array(s.data, s.samples, chunk)
chunk = "4-refpairtest_across/4-refpairtest-tmpalign/4-refpairtest.chunk_70"
samples = s.samples
data = s.data
# data are already chunked, read in the whole thing
#with open(chunk, 'rt') as infile:
# clusts = infile.read().split("//\n//\n")[:-1]
nnames
['>1A_0_7;*0', '>1B_0_7;*1', '>1D_0_7;*2', '>1C_0_7;*3', '>2E_0_7;*4', '>2F_0_7;*5', '>2H_0_7;*6', '>2G_0_7;*7', '>3I_0_7;*8', '>3J_0_7;*9', '>3L_0_7;*10', '>3K_0_7;*11']
def align_to_array(data, samples, chunk):
# data are already chunked, read in the whole thing
with open(chunk, 'rt') as infile:
clusts = infile.read().split("//\n//\n")[:-1]
# snames to ensure sorted order
samples.sort(key=lambda x: x.name)
snames = [sample.name for sample in samples]
# make a tmparr to store metadata (this can get huge, consider using h5)
maxvar = 25
maxlen = data._hackersonly["max_fragment_length"] + 20
maxinds = data.paramsdict["max_Indels_locus"]
ifilter = np.zeros(len(clusts), dtype=np.bool_)
dfilter = np.zeros(len(clusts), dtype=np.bool_)
varpos = np.zeros((len(clusts), maxvar), dtype='u1')
indels = np.zeros((len(clusts), len(samples), sum(maxinds)), dtype='u1')
edges = np.zeros((len(clusts), len(samples), 4), dtype='u1')
consens = np.zeros((len(clusts), maxlen), dtype="S1")
# create a persistent shell for running muscle in.
proc = sps.Popen(["bash"], stdin=sps.PIPE, stdout=sps.PIPE, bufsize=0)
# iterate over clusters until finished
allstack = []
for ldx in range(len(clusts)):
istack = []
lines = clusts[ldx].strip().split("\n")
names = lines[::2]
seqs = lines[1::2]
align1 = []
# find duplicates and skip aligning/ordering since it will be filtered.
unames = set([i.rsplit("_", 1)[0] for i in names])
if len(unames) < len(names):
dfilter[ldx] = True
istack = ["{}\n{}".format(i[1:], j) for i, j in zip(names, seqs)]
else:
# append counter to names because muscle doesn't retain order
nnames = [">{};*{}".format(j[1:], i) for i, j in enumerate(names)]
# make back into strings
cl1 = "\n".join(["\n".join(i) for i in zip(nnames, seqs)])
# store allele (lowercase) info on seqs
amask, abool = store_alleles(seqs)
# send align1 to the bash shell (TODO: check for pipe-overflow)
cmd1 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
.format(cl1, ipyrad.bins.muscle, "//\n"))
proc.stdin.write(cmd1.encode())
# read the stdout by line until splitter is reached
for line in iter(proc.stdout.readline, b'//\n'):
align1.append(line.decode())
# reorder into original order and arrange into a dictionary and arr
lines = "".join(align1)[1:].split("\n>")
dalign1 = dict([i.split("\n", 1) for i in lines])
keys = sorted(
dalign1.keys(),
key=lambda x: int(x.rsplit("*")[-1])
)
seqarr = np.zeros(
(len(nnames), len(dalign1[keys[0]].replace("\n", ""))),
dtype='S1',
)
for kidx, key in enumerate(keys):
concatseq = dalign1[key].replace("\n", "")
seqarr[kidx] = list(concatseq)
# fill in edges for each seq
edg = (
len(concatseq) - len(concatseq.lstrip('-')),
len(concatseq.rstrip('-')),
len(concatseq.rstrip('-')),
len(concatseq.rstrip('-')),
)
sidx = snames.index(key.rsplit('_', 1)[0])
edges[ldx, sidx] = edg
# get ref/variant counts in an array
varmat = refbuild(seqarr.view('u1'), PSEUDO_REF)
constmp = varmat[:, 0].view("S1")
consens[ldx, :constmp.size] = constmp
varstmp = np.nonzero(varmat[:, 1])[0]
varpos[ldx, :varstmp.size] = varstmp
# impute allele information back into the read b/c muscle
wkeys = np.argsort([i.rsplit("_", 1)[0] for i in keys])
# sort in sname (alphanumeric) order for indels storage
for idx in wkeys:
# seqarr and amask are in input order here
args = (seqarr, idx, amask)
seqarr[idx], indidx = retrieve_indels_and_alleles(*args)
sidx = snames.index(names[idx].rsplit("_", 1)[0][1:])
if indidx.size:
indels[sidx, :indidx.size] = indidx
wname = names[idx]
istack.append(
"{}\n{}".format(wname, b"".join(seqarr[idx]).decode()))
# store the stack
if istack:
allstack.append("\n".join(istack))
# cleanup
proc.stdout.close()
if proc.stderr:
proc.stderr.close()
proc.stdin.close()
proc.wait()
# write to file after
odx = chunk.rsplit("_")[-1]
alignfile = os.path.join(data.tmpdir, "align_{}.fa".format(odx))
with open(alignfile, 'wt') as outfile:
outfile.write("\n//\n//\n".join(allstack) + "\n")
#os.remove(chunk)
# save indels array to tmp dir
ifile = os.path.join(data.tmpdir, "indels_{}.tmp.npy".format(odx))
np.save(ifile, ifilter)
dfile = os.path.join(data.tmpdir, "duples_{}.tmp.npy".format(odx))
np.save(dfile, dfilter)
def store_alleles(seqs):
shape = (len(seqs), max([len(i) for i in seqs]))
arrseqs = np.zeros(shape, dtype=np.bytes_)
for row in range(arrseqs.shape[0]):
seqsrow = seqs[row]
arrseqs[row, :len(seqsrow)] = list(seqsrow)
amask = np.char.islower(arrseqs)
if np.any(amask):
return amask, True
else:
return amask, False
def retrieve_indels_and_alleles(seqarr, idx, amask):
concatarr = seqarr[idx]
newmask = np.zeros(len(concatarr), dtype=np.bool_)
# check for indels and impute to amask
indidx = np.where(concatarr == b"-")[0]
if np.sum(amask):
print('yes')
if indidx.size:
# impute for position of variants
allrows = np.arange(amask.shape[1])
mask = np.ones(allrows.shape[0], dtype=np.bool_)
for idx in indidx:
if idx < mask.shape[0]:
mask[idx] = False
not_idx = allrows[mask == 1]
# fill in new data into all other spots
newmask[not_idx] = amask[idx, :not_idx.shape[0]]
else:
newmask = amask[idx]
# lower the alleles
concatarr[newmask] = np.char.lower(concatarr[newmask])
return concatarr, indidx
np.load("4-refpairtest_across/4-refpairtest-tmpalign/indels_35.tmp.npy")
array([False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False])
align_to_array(data, samples, chunk)
chunks = glob.glob("4-refpairtest_across/4-refpairtest-tmpalign/4-refpairtest.chunk_*")
for chunk in chunks:
align_to_array(data, samples, chunk)
@numba.jit(nopython=True)
def refbuild(iseq, consdict):
""" Returns the most common base at each site in order. """
altrefs = np.zeros((iseq.shape[1], 4), dtype=np.uint8)
for col in range(iseq.shape[1]):
## expand colums with ambigs and remove N-
fcounts = np.zeros(111, dtype=np.int64)
counts = np.bincount(iseq[:, col])#, minlength=90)
fcounts[:counts.shape[0]] = counts
## set N and - to zero, wish numba supported minlen arg
fcounts[78] = 0
fcounts[45] = 0
## add ambig counts to true bases
for aidx in range(consdict.shape[0]):
nbases = fcounts[consdict[aidx, 0]]
for _ in range(nbases):
fcounts[consdict[aidx, 1]] += 1
fcounts[consdict[aidx, 2]] += 1
fcounts[consdict[aidx, 0]] = 0
## now get counts from the modified counts arr
who = np.argmax(fcounts)
altrefs[col, 0] = who
fcounts[who] = 0
## if an alt allele fill over the "." placeholder
who = np.argmax(fcounts)
if who:
altrefs[col, 1] = who
fcounts[who] = 0
## if 3rd or 4th alleles observed then add to arr
who = np.argmax(fcounts)
altrefs[col, 2] = who
fcounts[who] = 0
## if 3rd or 4th alleles observed then add to arr
who = np.argmax(fcounts)
altrefs[col, 3] = who
return altrefs
PSEUDO_REF = np.array([
[82, 71, 65],
[75, 71, 84],
[83, 71, 67],
[89, 84, 67],
[87, 84, 65],
[77, 67, 65],
[78, 78, 78],
[45, 45, 45],
], dtype=np.uint8)
seqarr.view("u1")
array([[84, 71, 67, ..., 67, 67, 71], [84, 71, 67, ..., 67, 67, 71], [84, 71, 67, ..., 67, 67, 71], ..., [84, 71, 67, ..., 67, 67, 71], [84, 71, 67, ..., 67, 67, 71], [84, 71, 67, ..., 67, 67, 71]], dtype=uint8)
aseqs = np.array([
list('ATG--GGA'),
list('A--GGGGA'),
list('---GGGGA'),
list('--------'),
])
np.where(aseqs == "-")
(array([0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3]), array([3, 4, 1, 2, 0, 1, 2, 0, 1, 2, 3, 4, 5, 6, 7]))
PSEUDO_REF = np.array([
[82, 71, 65],
[75, 71, 84],
[83, 71, 67],
[89, 84, 67],
[87, 84, 65],
[77, 67, 65],
[78, 78, 78],
[45, 45, 45],
], dtype=np.uint8)
@numba.jit(nopython=True)
def reftrick(iseq, consdict):
""" Returns the most common base at each site in order. """
altrefs = np.zeros((iseq.shape[1], 4), dtype=np.uint8)
#altrefs[:, 1] = 46
for col in range(iseq.shape[1]):
## expand colums with ambigs and remove N-
fcounts = np.zeros(111, dtype=np.int64)
counts = np.bincount(iseq[:, col])#, minlength=90)
fcounts[:counts.shape[0]] = counts
## set N and - to zero, wish numba supported minlen arg
fcounts[78] = 0
fcounts[45] = 0
## add ambig counts to true bases
for aidx in range(consdict.shape[0]):
nbases = fcounts[consdict[aidx, 0]]
for _ in range(nbases):
fcounts[consdict[aidx, 1]] += 1
fcounts[consdict[aidx, 2]] += 1
fcounts[consdict[aidx, 0]] = 0
## now get counts from the modified counts arr
who = np.argmax(fcounts)
altrefs[col, 0] = who
fcounts[who] = 0
## if an alt allele fill over the "." placeholder
who = np.argmax(fcounts)
if who:
altrefs[col, 1] = who
fcounts[who] = 0
## if 3rd or 4th alleles observed then add to arr
who = np.argmax(fcounts)
altrefs[col, 2] = who
fcounts[who] = 0
## if 3rd or 4th alleles observed then add to arr
who = np.argmax(fcounts)
altrefs[col, 3] = who
return altrefs
sss = np.array([
list("---AAAA----"),
list("---TTAA----"),
list("AAAAAAAAAAA"),
list("AATAAAATAAA"),
], dtype="S1")
sss
array([[b'-', b'-', b'-', b'A', b'A', b'A', b'A', b'-', b'-', b'-', b'-'], [b'-', b'-', b'-', b'T', b'T', b'A', b'A', b'-', b'-', b'-', b'-'], [b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A'], [b'A', b'A', b'T', b'A', b'A', b'A', b'A', b'T', b'A', b'A', b'A']], dtype='|S1')
varmat = reftrick(sss.view(np.uint8), PSEUDO_REF)
varmat
array([[65, 0, 0, 0], [65, 0, 0, 0], [65, 84, 0, 0], [65, 84, 0, 0], [65, 84, 0, 0], [65, 0, 0, 0], [65, 0, 0, 0], [65, 84, 0, 0], [65, 0, 0, 0], [65, 0, 0, 0], [65, 0, 0, 0]], dtype=uint8)
names
['>1A_0_7', '>1B_0_7', '>1D_0_7', '>1C_0_7', '>2E_0_7', '>2F_0_7', '>2H_0_7', '>2G_0_7', '>3I_0_7', '>3J_0_7', '>3L_0_7', '>3K_0_7']
%%timeit
store_alleles(seqs)
31.3 µs ± 764 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
[b"".join(i) for i in arrseqs ]
[b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA', b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA', b'TGCAGCTTCTGAAGCCTTAAATCCTCACGTCAACATGATGCCTWCATGAA', b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA', b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA', b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA', b'TGCAGCTTCTGAAGCCTGCAATCCTCACGTCAATATGATGCCTACATGAA', b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA', b'TGCAGCTYCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA', b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA', b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA', b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA']
snames
['1A_0', '1B_0', '1C_0', '1D_0', '2E_0', '2F_0', '2G_0', '2H_0', '3I_0', '3J_0', '3K_0', '3L_0']
widx = np.argsort([i.rsplit(";", 1)[0] for i in keys])
for i in widx:
print(names[i], b"".join(seqarr[i]))
>1A_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA' >1B_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA' >1C_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA' >1D_0_7 b'TGCAGCTTCTGAAGCCTTAAATCCTCACGTCAACATGATGCCTWCATGAA' >2E_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA' >2F_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA' >2G_0_7 b'TGCAGCTTCTGAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA' >2H_0_7 b'TGCAGCTTCTGAAGCCTGCAATCCTCACGTCAATATGATGCCTACATGAA' >3I_0_7 b'TGCAGCTYCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA' >3J_0_7 b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA' >3K_0_7 b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA' >3L_0_7 b'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'
consensus = varmat[:, 0]
variants = np.nonzero(varmat[:, 1])[0]
variants
array([2, 3, 4, 7])
seqarr == seqarr[0]
array([[ True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True], [ True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True], [ True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True], [ True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True], [ True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True], [ True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True], [ True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True], [ True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True], [ True, True, True, True, True, True, True, False, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True], [ True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True], [ True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True], [ True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]])
for kidx in range(seqarr.shape[0]):
pass
# reorder b/c muscle doesn't keep order
lines = "".join(align1)[1:].split("\n>")
dalign1 = dict([i.split("\n", 1) for i in lines])
keys = sorted(
dalign1.keys(),
key=lambda x: int(x.rsplit("*")[-1])
)
seqarr = np.zeros(
(len(nnames), len(dalign1[keys[0]])),
dtype='U1',
)
for kidx, key in enumerate(keys):
concatseq = dalign1[key].replace("\n", '')
seqarr[kidx] = list(dalign1[key].replace("\n", ""))
seqarr
array([['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A', 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C', 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T', 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'], ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A', 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C', 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T', 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'], ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A', 'G', 'C', 'C', 'T', 'T', 'A', 'A', 'A', 'T', 'C', 'C', 'T', 'C', 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T', 'G', 'C', 'C', 'T', 'W', 'C', 'A', 'T', 'G', 'A', 'A'], ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A', 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C', 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T', 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'], ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A', 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C', 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T', 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'], ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A', 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C', 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T', 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'], ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A', 'G', 'C', 'C', 'T', 'G', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C', 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'T', 'A', 'T', 'G', 'A', 'T', 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'], ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'G', 'A', 'A', 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C', 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T', 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'], ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'Y', 'C', 'T', 'T', 'A', 'A', 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C', 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T', 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'], ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'T', 'A', 'A', 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C', 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T', 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'], ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'T', 'A', 'A', 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C', 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T', 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A'], ['T', 'G', 'C', 'A', 'G', 'C', 'T', 'T', 'C', 'T', 'T', 'A', 'A', 'G', 'C', 'C', 'T', 'T', 'C', 'A', 'A', 'T', 'C', 'C', 'T', 'C', 'A', 'C', 'G', 'T', 'C', 'A', 'A', 'C', 'A', 'T', 'G', 'A', 'T', 'G', 'C', 'C', 'T', 'A', 'C', 'A', 'T', 'G', 'A', 'A']], dtype='<U1')
# fill in edges for each seq
edg = (
len(concatseq) - len(concatseq.lstrip('-')),
len(concatseq.rstrip('-')),
len(concatseq.rstrip('-')),
len(concatseq.rstrip('-')),
)
edges[ldx, kidx] = edg
edges[ldx, kidx]
concatseq[0:50] + 'nnnn'[50:50] + concatseq[50:50]
'TGCAGCTTCTTAAGCCTTCAATCCTCACGTCAACATGATGCCTACATGAA'
seeds = [
os.path.join(
data.dirs.across,
"{}-{}.htemp".format(data.name, jobid)) for jobid in jobids
]
allseeds = os.path.join(data.dirs.across, "{}-x.fa".format(data.name))
cmd1 = ['cat'] + seeds
cmd2 = [ipyrad.bins.vsearch, '--sortbylength', '-', '--output', allseeds]
proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)
proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=sps.PIPE, close_fds=True)
out = proc2.communicate()
proc1.stdout.close()
out
(b'', None)
data = s.data
jobid = 2
nthreads = 4
# get files for this jobid
catshuf = os.path.join(
data.dirs.across,
"{}-{}-catshuf.fa".format(data.name, jobid))
uhaplos = os.path.join(
data.dirs.across,
"{}-{}.utemp".format(data.name, jobid))
hhaplos = os.path.join(
data.dirs.across,
"{}-{}.htemp".format(data.name, jobid))
#logfile = os.path.join(data.dirs.across, "s6_cluster_stats.txt")
## parameters that vary by datatype
## (too low of cov values yield too many poor alignments)
strand = "plus"
cov = 0.75 # 0.90
if data.paramsdict["datatype"] in ["gbs", "2brad"]:
strand = "both"
cov = 0.60
elif data.paramsdict["datatype"] == "pairgbs":
strand = "both"
cov = 0.75 # 0.90
## nthreads is calculated in 'call_cluster()'
cmd = [ipyrad.bins.vsearch,
"-cluster_smallmem", catshuf,
"-strand", strand,
"-query_cov", str(cov),
"-minsl", str(0.5),
"-id", str(data.paramsdict["clust_threshold"]),
"-userout", uhaplos,
"-notmatched", hhaplos,
"-userfields", "query+target+qstrand",
"-maxaccepts", "1",
"-maxrejects", "0",
"-fasta_width", "0",
"-threads", str(nthreads), # "0",
"-fulldp",
"-usersort",
]
proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)
out = proc.communicate()
if proc.returncode:
raise IPyradError(out)
proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)
out = proc.communicate()
if proc.returncode:
raise IPyradError(out)
build_concat_files(s.data, 1, [s.data.samples[i] for i in s.cgroups[1]], 123)
data = s.data
jobid = 1
samples = [s.data.samples[i] for i in s.cgroups[1]]
randomseed = 123
conshandles = [
sample.files.consens[0] for sample in samples if
sample.stats.reads_consens]
conshandles.sort()
assert conshandles, "no consensus files found"
## concatenate all of the gzipped consens files
cmd = ['cat'] + conshandles
groupcons = os.path.join(
data.dirs.across,
"{}-{}-catcons.gz".format(data.name, jobid))
LOGGER.debug(" ".join(cmd))
with open(groupcons, 'w') as output:
call = sps.Popen(cmd, stdout=output, close_fds=True)
call.communicate()
## a string of sed substitutions for temporarily replacing hetero sites
## skips lines with '>', so it doesn't affect taxon names
subs = ["/>/!s/W/A/g", "/>/!s/w/A/g", "/>/!s/R/A/g", "/>/!s/r/A/g",
"/>/!s/M/A/g", "/>/!s/m/A/g", "/>/!s/K/T/g", "/>/!s/k/T/g",
"/>/!s/S/C/g", "/>/!s/s/C/g", "/>/!s/Y/C/g", "/>/!s/y/C/g"]
subs = ";".join(subs)
## impute pseudo-haplo information to avoid mismatch at hetero sites
## the read data with hetero sites is put back into clustered data later.
## pipe passed data from gunzip to sed.
cmd1 = ["gunzip", "-c", groupcons]
cmd2 = ["sed", subs]
LOGGER.debug(" ".join(cmd1))
LOGGER.debug(" ".join(cmd2))
proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)
allhaps = groupcons.replace("-catcons.gz", "-cathaps.fa")
with open(allhaps, 'w') as output:
proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=output, close_fds=True)
proc2.communicate()
proc1.stdout.close()
## now sort the file using vsearch
allsort = groupcons.replace("-catcons.gz", "-catsort.fa")
cmd1 = [ipyrad.bins.vsearch,
"--sortbylength", allhaps,
"--fasta_width", "0",
"--output", allsort]
LOGGER.debug(" ".join(cmd1))
proc1 = sps.Popen(cmd1, close_fds=True)
proc1.communicate()
(None, None)
data.dirs.across
''
# calculate the theading of cluster1 jobs:
ncpus = len(self.ipyclient.ids)
njobs = len(self.cgroups)
nnodes = len(self.hostd)
minthreads = 2
maxthreads = 10
# product
targets = [0, 1]
self.thview = self.ipyclient.load_balanced_view(targets=targets)
s.prepare_concats()
[####################] 100% 0:00:02 | concatenating inputs | s6 |
cluster1(data, 2, 4)
--------------------------------------------------------------------------- IPyradError Traceback (most recent call last) <ipython-input-31-38c961b35fd2> in <module>() ----> 1 cluster1(data, 2, 4) <ipython-input-30-0187c2dc7daf> in cluster1(data, jobid, nthreads) 45 out = proc.communicate() 46 if proc.returncode: ---> 47 raise IPyradError(out) IPyradError: (b'vsearch v2.8.0_linux_x86_64, 15.5GB RAM, 4 cores\nhttps://github.com/torognes/vsearch\n\n\n\nUnable to open file for reading (/home/deren/Documents/ipyrad/tests/4-refpairtest_across/4-refpairtest-2-catshuf.fa)\n', None)
def cluster1(data, jobid, nthreads):
# get files for this jobid
catshuf = os.path.join(
data.dirs.across,
"{}-{}-catshuf.fa".format(data.name, jobid))
uhaplos = os.path.join(
data.dirs.across,
"{}-{}.utemp".format(data.name, jobid))
hhaplos = os.path.join(
data.dirs.across,
"{}-{}.htemp".format(data.name, jobid))
## parameters that vary by datatype
## (too low of cov values yield too many poor alignments)
strand = "plus"
cov = 0.75 # 0.90
if data.paramsdict["datatype"] in ["gbs", "2brad"]:
strand = "both"
cov = 0.60
elif data.paramsdict["datatype"] == "pairgbs":
strand = "both"
cov = 0.75 # 0.90
## nthreads is calculated in 'call_cluster()'
cmd = [ipyrad.bins.vsearch,
"-cluster_smallmem", catshuf,
"-strand", strand,
"-query_cov", str(cov),
"-minsl", str(0.5),
"-id", str(data.paramsdict["clust_threshold"]),
"-userout", uhaplos,
"-notmatched", hhaplos,
"-userfields", "query+target+qstrand",
"-maxaccepts", "1",
"-maxrejects", "0",
"-fasta_width", "0",
"-threads", str(nthreads), # "0",
"-fulldp",
"-usersort",
]
proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)
out = proc.communicate()
if proc.returncode:
raise IPyradError(out)
if len(self.samples) < 50:
nclusters = 1
elif len(self.samples) < 100:
nclusters = 2
elif len(self.samples) < 200:
nclusters = 4
elif len(self.samples) > 500:
nclusters = 10
else:
nclusters = int(len(self.samples) / 10)
self.cgroups
{0: ['1A_0', '1B_0', '1C_0', '1D_0'], 1: ['2E_0', '2F_0', '2G_0', '2H_0'], 2: ['3I_0', '3J_0', '3K_0', '3L_0']}
hosts
{'oud': [0, 1, 2, 3]}