#!/usr/bin/env python
# coding: utf-8

# In[1]:

# explicit imports for names used throughout this notebook
# (several of these are also pulled in by the star imports below)
import os
import glob
import subprocess as sps
import numpy as np
import numba
import ipyrad
import ipyrad as ip
import ipyparallel as ipp
from ipyrad.assemble.clustmap import *
from ipyrad.assemble.cluster_across import *

# In[2]:

data = ip.Assembly("5-cyatho")
data.set_params("project_dir", "5-cyatho")
data.set_params("sorted_fastq_path", "./example_empirical_rad/*.gz")
data.set_params("filter_adapters", 2)
data.set_params("clust_threshold", .90)

# In[17]:

data.run("12")

# In[2]:

from ipyrad.assemble.clustmap import *
ipyclient = ipp.Client()
s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient)
samples = list(s3.data.samples.values())
sample = samples[1]

# In[19]:

s3.run()

# In[21]:

data.run("45")

# In[3]:

data = ip.load_json("5-cyatho/5-cyatho.json")
samples = list(data.samples.values())
ipyclient = ipp.Client()

# In[4]:

from ipyrad.assemble.cluster_across import *

popmins = {'o': 0, 'c': 3, 'r': 3}
popdict = {
    'o': ['33588_przewalskii', '32082_przewalskii'],
    'c': ['29154_superba', '30686_cyathophylla',
          '41478_cyathophylloides', '41954_cyathophylloides'],
    'r': ['35236_rex', '35855_rex', '38362_rex', '39618_rex',
          '40578_rex', '30556_thamno', '33413_thamno'],
}
data._link_populations(popdict, popmins)
data.populations

# In[5]:

s6 = Step6(data, samples, ipyclient, 0, 0)

# In[7]:

s6.remote_build_concats_tier1()

# In[8]:

s6.remote_cluster1()

# In[9]:

s6.remote_build_concats_tier2()

# In[10]:

s6.remote_cluster2()

# In[17]:

s6.remote_build_denovo_clusters()

# In[13]:

s6.remote_align_denovo_clusters()

# In[30]:

chunk = "./5-cyatho/5-cyatho_across/5-cyatho-tmpalign/5-cyatho.chunk_11760"
align_to_array(s6.data, s6.samples, chunk)

# In[29]:

def align_to_array(data, samples, chunk):
    # data are already chunked, read in the whole thing
    with open(chunk, 'rt') as infile:
        clusts = infile.read().split("//\n//\n")[:-1]

    # snames to ensure sorted order
    samples.sort(key=lambda x: x.name)
    snames = [sample.name for sample in samples]

    # make a tmparr to store metadata (this can get huge, consider using h5)
    maxvar = 25
    maxlen = data._hackersonly["max_fragment_length"] + 20
    maxinds = data.paramsdict["max_Indels_locus"]
    ifilter = np.zeros(len(clusts), dtype=np.bool_)
    dfilter = np.zeros(len(clusts), dtype=np.bool_)
    varpos = np.zeros((len(clusts), maxvar), dtype='u1')
    indels = np.zeros((len(clusts), len(samples), sum(maxinds)), dtype='u1')
    edges = np.zeros((len(clusts), len(samples), 4), dtype='u1')
    consens = np.zeros((len(clusts), maxlen), dtype="S1")

    # create a persistent shell for running muscle in.
    proc = sps.Popen(["bash"], stdin=sps.PIPE, stdout=sps.PIPE, bufsize=0)

    # iterate over clusters until finished
    allstack = []
    for ldx in range(len(clusts)):
        istack = []
        lines = clusts[ldx].strip().split("\n")
        names = lines[::2]
        seqs = lines[1::2]
        seqs = [i[:90] for i in seqs]
        align1 = []

        # find duplicates and skip aligning but keep it for downstream.
        unames = set([i.rsplit("_", 1)[0] for i in names])
        if len(unames) < len(names):
            dfilter[ldx] = True
            istack = ["{}\n{}".format(i[1:], j) for i, j in zip(names, seqs)]
        else:
            # append counter to names because muscle doesn't retain order
            nnames = [">{};*{}".format(j[1:], i) for i, j in enumerate(names)]

            # make back into strings
            cl1 = "\n".join(["\n".join(i) for i in zip(nnames, seqs)])

            # store allele (lowercase) info
            amask, abool = store_alleles(seqs)

            # send align1 to the bash shell (TODO: check for pipe-overflow)
            cmd1 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                    .format(cl1, ipyrad.bins.muscle, "//\n"))
            proc.stdin.write(cmd1.encode())

            # read the stdout by line until splitter is reached
            for line in iter(proc.stdout.readline, b'//\n'):
                align1.append(line.decode())

            # reorder b/c muscle doesn't keep order
            lines = "".join(align1)[1:].split("\n>")
            dalign1 = dict([i.split("\n", 1) for i in lines])
            keys = sorted(
                dalign1.keys(),
                key=lambda x: int(x.rsplit("*")[-1]))
            seqarr = np.zeros(
                (len(nnames), len(dalign1[keys[0]].replace("\n", ""))),
                dtype='S1',
            )
            for kidx, key in enumerate(keys):
                concatseq = dalign1[key].replace("\n", "")
                seqarr[kidx] = list(concatseq)

                # fill in edges for each seq
                edg = (
                    len(concatseq) - len(concatseq.lstrip('-')),
                    len(concatseq.rstrip('-')),
                    len(concatseq.rstrip('-')),
                    len(concatseq.rstrip('-')),
                )
                sidx = snames.index(key.rsplit('_', 1)[0])
                edges[ldx, sidx] = edg

            # get ref/variant counts in an array
            varmat = refbuild(seqarr.view('u1'), PSEUDO_REF)
            constmp = varmat[:, 0].view('S1')
            consens[ldx, :constmp.size] = constmp
            varstmp = np.nonzero(varmat[:, 1])[0]
            varpos[ldx, :varstmp.size] = varstmp

            # impute allele information back into the read b/c muscle
            wkeys = np.argsort([i.rsplit("_", 1)[0] for i in keys])

            # sort in sname (alphanumeric) order for indels storage
            for idx in wkeys:
                # seqarr and amask are in input order here
                args = (seqarr, idx, amask)
                seqarr[idx], indidx = retrieve_indels_and_alleles(*args)
                sidx = snames.index(names[idx].rsplit("_", 1)[0][1:])
                if indidx.size:
                    indsize = min(indidx.size, sum(maxinds))
                    indels[ldx, sidx, :indsize] = indidx[:indsize]
                wname = names[idx]

                # store (only for visually checking alignments)
                istack.append(
                    "{}\n{}".format(wname, b"".join(seqarr[idx]).decode()))

        # indel filter
        if indels.sum(axis=1).max() >= sum(maxinds):
            ifilter[ldx] = True

        # store the stack (only for visually checking alignments)
        if istack:
            allstack.append("\n".join(istack))

    # cleanup
    proc.stdout.close()
    if proc.stderr:
        proc.stderr.close()
    proc.stdin.close()
    proc.wait()

    # write to file after (only for visually checking alignments)
    odx = chunk.rsplit("_")[-1]
    alignfile = os.path.join(data.tmpdir, "aligned_{}.fa".format(odx))
    with open(alignfile, 'wt') as outfile:
        outfile.write("\n//\n//\n".join(allstack) + "\n")
    #os.remove(chunk)

    # save indels array to tmp dir
    np.save(
        os.path.join(data.tmpdir, "ifilter_{}.tmp.npy".format(odx)),
        ifilter)
    np.save(
        os.path.join(data.tmpdir, "dfilter_{}.tmp.npy".format(odx)),
        dfilter)

# In[13]:

data = ip.load_json("4-refpairtest.json")
samples = list(data.samples.values())
sample = samples[0]
force = True
ipyclient = ipp.Client()

# In[3]:

data.samples

# In[4]:

popmins = {'1': 3, '2': 3, '3': 3}
popdict = {
    '1': ['1A_0', '1B_0', '1C_0', '1D_0'],
    '2': ['2E_0', '2F_0', '2G_0', '2H_0'],
    '3': ['3I_0', '3J_0', '3K_0', '3L_0'],
}
data._link_populations(popdict, popmins)
data.populations

# In[5]:

s = Step6(data, samples, ipyclient, 123, True)
s.samples

# In[6]:

s.remote_build_concats_tier1()

# In[7]:

s.remote_cluster1()

# In[8]:

s.remote_build_concats_tier2()

# In[9]:

s.remote_cluster2()

# In[10]:

s.remote_build_denovo_clusters()

# In[11]:

chunks = glob.glob('4-refpairtest_across/4-refpairtest-tmpalign/*.chunk*')
for chunk in chunks:
    align_to_array(s.data, s.samples, chunk)

# In[317]:

chunk = "4-refpairtest_across/4-refpairtest-tmpalign/4-refpairtest.chunk_70"
samples = s.samples
data = s.data

# data are already chunked, read in the whole thing
#with open(chunk, 'rt') as infile:
#    clusts = infile.read().split("//\n//\n")[:-1]

# In[389]:

nnames

# In[451]:

def align_to_array(data, samples, chunk):
    # data are already chunked, read in the whole thing
    with open(chunk, 'rt') as infile:
        clusts = infile.read().split("//\n//\n")[:-1]

    # snames to ensure sorted order
    samples.sort(key=lambda x: x.name)
    snames = [sample.name for sample in samples]

    # make a tmparr to store metadata (this can get huge, consider using h5)
    maxvar = 25
    maxlen = data._hackersonly["max_fragment_length"] + 20
    maxinds = data.paramsdict["max_Indels_locus"]
    ifilter = np.zeros(len(clusts), dtype=np.bool_)
    dfilter = np.zeros(len(clusts), dtype=np.bool_)
    varpos = np.zeros((len(clusts), maxvar), dtype='u1')
    indels = np.zeros((len(clusts), len(samples), sum(maxinds)), dtype='u1')
    edges = np.zeros((len(clusts), len(samples), 4), dtype='u1')
    consens = np.zeros((len(clusts), maxlen), dtype="S1")

    # create a persistent shell for running muscle in.
    proc = sps.Popen(["bash"], stdin=sps.PIPE, stdout=sps.PIPE, bufsize=0)

    # iterate over clusters until finished
    allstack = []
    for ldx in range(len(clusts)):
        istack = []
        lines = clusts[ldx].strip().split("\n")
        names = lines[::2]
        seqs = lines[1::2]
        align1 = []

        # find duplicates and skip aligning/ordering since it will be filtered.
        unames = set([i.rsplit("_", 1)[0] for i in names])
        if len(unames) < len(names):
            dfilter[ldx] = True
            istack = ["{}\n{}".format(i[1:], j) for i, j in zip(names, seqs)]
        else:
            # append counter to names because muscle doesn't retain order
            nnames = [">{};*{}".format(j[1:], i) for i, j in enumerate(names)]

            # make back into strings
            cl1 = "\n".join(["\n".join(i) for i in zip(nnames, seqs)])

            # store allele (lowercase) info on seqs
            amask, abool = store_alleles(seqs)

            # send align1 to the bash shell (TODO: check for pipe-overflow)
            cmd1 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                    .format(cl1, ipyrad.bins.muscle, "//\n"))
            proc.stdin.write(cmd1.encode())

            # read the stdout by line until splitter is reached
            for line in iter(proc.stdout.readline, b'//\n'):
                align1.append(line.decode())

            # reorder into original order and arrange into a dictionary and arr
            lines = "".join(align1)[1:].split("\n>")
            dalign1 = dict([i.split("\n", 1) for i in lines])
            keys = sorted(
                dalign1.keys(),
                key=lambda x: int(x.rsplit("*")[-1]))
            seqarr = np.zeros(
                (len(nnames), len(dalign1[keys[0]].replace("\n", ""))),
                dtype='S1',
            )
            for kidx, key in enumerate(keys):
                concatseq = dalign1[key].replace("\n", "")
                seqarr[kidx] = list(concatseq)

                # fill in edges for each seq
                edg = (
                    len(concatseq) - len(concatseq.lstrip('-')),
                    len(concatseq.rstrip('-')),
                    len(concatseq.rstrip('-')),
                    len(concatseq.rstrip('-')),
                )
                sidx = snames.index(key.rsplit('_', 1)[0])
                edges[ldx, sidx] = edg

            # get ref/variant counts in an array
            varmat = refbuild(seqarr.view('u1'), PSEUDO_REF)
            constmp = varmat[:, 0].view("S1")
            consens[ldx, :constmp.size] = constmp
            varstmp = np.nonzero(varmat[:, 1])[0]
            varpos[ldx, :varstmp.size] = varstmp

            # impute allele information back into the read b/c muscle
            wkeys = np.argsort([i.rsplit("_", 1)[0] for i in keys])

            # sort in sname (alphanumeric) order for indels storage
            for idx in wkeys:
                # seqarr and amask are in input order here
                args = (seqarr, idx, amask)
                seqarr[idx], indidx = retrieve_indels_and_alleles(*args)
                sidx = snames.index(names[idx].rsplit("_", 1)[0][1:])
                if indidx.size:
                    # clamp to the max indels allowed per locus
                    indsize = min(indidx.size, sum(maxinds))
                    indels[ldx, sidx, :indsize] = indidx[:indsize]
                wname = names[idx]
                istack.append(
                    "{}\n{}".format(wname, b"".join(seqarr[idx]).decode()))

        # store the stack
        if istack:
            allstack.append("\n".join(istack))

    # cleanup
    proc.stdout.close()
    if proc.stderr:
        proc.stderr.close()
    proc.stdin.close()
    proc.wait()

    # write to file after
    odx = chunk.rsplit("_")[-1]
    alignfile = os.path.join(data.tmpdir, "align_{}.fa".format(odx))
    with open(alignfile, 'wt') as outfile:
        outfile.write("\n//\n//\n".join(allstack) + "\n")
    #os.remove(chunk)

    # save indels array to tmp dir
    ifile = os.path.join(data.tmpdir, "indels_{}.tmp.npy".format(odx))
    np.save(ifile, ifilter)
    dfile = os.path.join(data.tmpdir, "duples_{}.tmp.npy".format(odx))
    np.save(dfile, dfilter)

# In[456]:

def store_alleles(seqs):
    shape = (len(seqs), max([len(i) for i in seqs]))
    arrseqs = np.zeros(shape, dtype=np.bytes_)
    for row in range(arrseqs.shape[0]):
        seqsrow = seqs[row]
        arrseqs[row, :len(seqsrow)] = list(seqsrow)
    amask = np.char.islower(arrseqs)
    if np.any(amask):
        return amask, True
    else:
        return amask, False


def retrieve_indels_and_alleles(seqarr, idx, amask):
    concatarr = seqarr[idx]
    newmask = np.zeros(len(concatarr), dtype=np.bool_)

    # check for indels and impute to amask
    indidx = np.where(concatarr == b"-")[0]
    if np.sum(amask):
        print('yes')
        if indidx.size:
            # impute for position of variants
            allrows = np.arange(amask.shape[1])
            mask = np.ones(allrows.shape[0], dtype=np.bool_)
            for gap in indidx:
                if gap < mask.shape[0]:
                    mask[gap] = False
            not_idx = allrows[mask == 1]

            # fill in new data into all other spots
            newmask[not_idx] = amask[idx, :not_idx.shape[0]]
        else:
            newmask = amask[idx]

        # lower the alleles
        concatarr[newmask] = np.char.lower(concatarr[newmask])

    return concatarr, indidx

# In[462]:

np.load("4-refpairtest_across/4-refpairtest-tmpalign/indels_35.tmp.npy")

# In[457]:

align_to_array(data, samples, chunk)

# In[459]:

chunks = glob.glob("4-refpairtest_across/4-refpairtest-tmpalign/4-refpairtest.chunk_*")
for chunk in chunks:
    align_to_array(data, samples, chunk)

# In[263]:

@numba.jit(nopython=True)
def refbuild(iseq, consdict):
    """ Returns the most common base at each site in order. """
    altrefs = np.zeros((iseq.shape[1], 4), dtype=np.uint8)

    for col in range(iseq.shape[1]):
        ## expand columns with ambigs and remove N-
        fcounts = np.zeros(111, dtype=np.int64)
        counts = np.bincount(iseq[:, col])  #, minlength=90)
        fcounts[:counts.shape[0]] = counts

        ## set N and - to zero, wish numba supported minlen arg
        fcounts[78] = 0
        fcounts[45] = 0

        ## add ambig counts to true bases
        for aidx in range(consdict.shape[0]):
            nbases = fcounts[consdict[aidx, 0]]
            for _ in range(nbases):
                fcounts[consdict[aidx, 1]] += 1
                fcounts[consdict[aidx, 2]] += 1
            fcounts[consdict[aidx, 0]] = 0

        ## now get counts from the modified counts arr
        who = np.argmax(fcounts)
        altrefs[col, 0] = who
        fcounts[who] = 0

        ## if an alt allele fill over the "." placeholder
        who = np.argmax(fcounts)
        if who:
            altrefs[col, 1] = who
            fcounts[who] = 0

            ## if 3rd or 4th alleles observed then add to arr
            who = np.argmax(fcounts)
            altrefs[col, 2] = who
            fcounts[who] = 0

            ## if 3rd or 4th alleles observed then add to arr
            who = np.argmax(fcounts)
            altrefs[col, 3] = who

    return altrefs


PSEUDO_REF = np.array([
    [82, 71, 65],
    [75, 71, 84],
    [83, 71, 67],
    [89, 84, 67],
    [87, 84, 65],
    [77, 67, 65],
    [78, 78, 78],
    [45, 45, 45],
], dtype=np.uint8)

# In[274]:

seqarr.view("u1")

# In[132]:

aseqs = np.array([
    list('ATG--GGA'),
    list('A--GGGGA'),
    list('---GGGGA'),
    list('--------'),
])
np.where(aseqs == "-")

# In[156]:

PSEUDO_REF = np.array([
    [82, 71, 65],
    [75, 71, 84],
    [83, 71, 67],
    [89, 84, 67],
    [87, 84, 65],
    [77, 67, 65],
    [78, 78, 78],
    [45, 45, 45],
], dtype=np.uint8)

# In[164]:

@numba.jit(nopython=True)
def reftrick(iseq, consdict):
    """ Returns the most common base at each site in order. """
    altrefs = np.zeros((iseq.shape[1], 4), dtype=np.uint8)
    #altrefs[:, 1] = 46

    for col in range(iseq.shape[1]):
        ## expand columns with ambigs and remove N-
        fcounts = np.zeros(111, dtype=np.int64)
        counts = np.bincount(iseq[:, col])  #, minlength=90)
        fcounts[:counts.shape[0]] = counts

        ## set N and - to zero, wish numba supported minlen arg
        fcounts[78] = 0
        fcounts[45] = 0

        ## add ambig counts to true bases
        for aidx in range(consdict.shape[0]):
            nbases = fcounts[consdict[aidx, 0]]
            for _ in range(nbases):
                fcounts[consdict[aidx, 1]] += 1
                fcounts[consdict[aidx, 2]] += 1
            fcounts[consdict[aidx, 0]] = 0

        ## now get counts from the modified counts arr
        who = np.argmax(fcounts)
        altrefs[col, 0] = who
        fcounts[who] = 0

        ## if an alt allele fill over the "." placeholder
        who = np.argmax(fcounts)
        if who:
            altrefs[col, 1] = who
            fcounts[who] = 0

            ## if 3rd or 4th alleles observed then add to arr
            who = np.argmax(fcounts)
            altrefs[col, 2] = who
            fcounts[who] = 0

            ## if 3rd or 4th alleles observed then add to arr
            who = np.argmax(fcounts)
            altrefs[col, 3] = who

    return altrefs

# In[195]:

sss = np.array([
    list("---AAAA----"),
    list("---TTAA----"),
    list("AAAAAAAAAAA"),
    list("AATAAAATAAA"),
], dtype="S1")
sss

# In[196]:

varmat = reftrick(sss.view(np.uint8), PSEUDO_REF)
varmat

# In[226]:

names

# In[202]:

get_ipython().run_cell_magic('timeit', '', 'store_alleles(seqs)\n')

# In[219]:

[b"".join(i) for i in arrseqs]

# In[229]:

snames

# In[237]:

widx = np.argsort([i.rsplit(";", 1)[0] for i in keys])
for i in widx:
    print(names[i], b"".join(seqarr[i]))

# In[184]:

consensus = varmat[:, 0]
variants = np.nonzero(varmat[:, 1])[0]
variants

# In[148]:

seqarr == seqarr[0]

# In[145]:

for kidx in range(seqarr.shape[0]):
    pass

# In[ ]:

# In[ ]:

# In[133]:

# reorder b/c muscle doesn't keep order
lines = "".join(align1)[1:].split("\n>")
dalign1 = dict([i.split("\n", 1) for i in lines])
keys = sorted(
    dalign1.keys(),
    key=lambda x: int(x.rsplit("*")[-1]))
seqarr = np.zeros(
    (len(nnames), len(dalign1[keys[0]])),
    dtype='U1',
)
for kidx, key in enumerate(keys):
    concatseq = dalign1[key].replace("\n", '')
    seqarr[kidx] = list(dalign1[key].replace("\n", ""))
seqarr

# In[101]:

# fill in edges for each seq
edg = (
    len(concatseq) - len(concatseq.lstrip('-')),
    len(concatseq.rstrip('-')),
    len(concatseq.rstrip('-')),
    len(concatseq.rstrip('-')),
)
edges[ldx, kidx] = edg
edges[ldx, kidx]
concatseq[0:50] + 'nnnn'[50:50] + concatseq[50:50]

# In[22]:

seeds = [
    os.path.join(
        data.dirs.across,
        "{}-{}.htemp".format(data.name, jobid))
    for jobid in jobids
]
allseeds = os.path.join(data.dirs.across, "{}-x.fa".format(data.name))
cmd1 = ['cat'] + seeds
cmd2 = [ipyrad.bins.vsearch,
        '--sortbylength', '-',
        '--output', allseeds]
proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)
proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=sps.PIPE, close_fds=True)
out = proc2.communicate()
proc1.stdout.close()

# In[23]:

out

# In[12]:

data = s.data
jobid = 2
nthreads = 4

# In[14]:

# get files for this jobid
catshuf = os.path.join(
    data.dirs.across,
    "{}-{}-catshuf.fa".format(data.name, jobid))
uhaplos = os.path.join(
    data.dirs.across,
    "{}-{}.utemp".format(data.name, jobid))
hhaplos = os.path.join(
    data.dirs.across,
    "{}-{}.htemp".format(data.name, jobid))
#logfile = os.path.join(data.dirs.across, "s6_cluster_stats.txt")

## parameters that vary by datatype
## (cov values that are too low yield too many poor alignments)
strand = "plus"
cov = 0.75  # 0.90
if data.paramsdict["datatype"] in ["gbs", "2brad"]:
    strand = "both"
    cov = 0.60
elif data.paramsdict["datatype"] == "pairgbs":
    strand = "both"
    cov = 0.75  # 0.90

## nthreads is calculated in 'call_cluster()'
cmd = [ipyrad.bins.vsearch,
       "-cluster_smallmem", catshuf,
       "-strand", strand,
       "-query_cov", str(cov),
       "-minsl", str(0.5),
       "-id", str(data.paramsdict["clust_threshold"]),
       "-userout", uhaplos,
       "-notmatched", hhaplos,
       "-userfields", "query+target+qstrand",
       "-maxaccepts", "1",
       "-maxrejects", "0",
       "-fasta_width", "0",
       "-threads", str(nthreads),  # "0",
       "-fulldp",
       "-usersort",
       ]
proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)
out = proc.communicate()
if proc.returncode:
    raise IPyradError(out)

# In[20]:

proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE)
out = proc.communicate()
if proc.returncode:
    raise IPyradError(out)

# In[22]:

build_concat_files(s.data, 1, [s.data.samples[i] for i in s.cgroups[1]], 123)

# In[23]:

data = s.data
jobid = 1
samples = [s.data.samples[i] for i in s.cgroups[1]]
randomseed = 123

# In[24]:

conshandles = [
    sample.files.consens[0] for sample in samples
    if sample.stats.reads_consens]
conshandles.sort()
assert conshandles, "no consensus files found"

# In[26]:

## concatenate all of the gzipped consens files
cmd = ['cat'] + conshandles
groupcons = os.path.join(
    data.dirs.across,
    "{}-{}-catcons.gz".format(data.name, jobid))
LOGGER.debug(" ".join(cmd))
with open(groupcons, 'w') as output:
    call = sps.Popen(cmd, stdout=output, close_fds=True)
    call.communicate()

## a string of sed substitutions for temporarily replacing hetero sites
## skips lines with '>', so it doesn't affect taxon names
subs = ["/>/!s/W/A/g", "/>/!s/w/A/g", "/>/!s/R/A/g", "/>/!s/r/A/g",
        "/>/!s/M/A/g", "/>/!s/m/A/g", "/>/!s/K/T/g", "/>/!s/k/T/g",
        "/>/!s/S/C/g", "/>/!s/s/C/g", "/>/!s/Y/C/g", "/>/!s/y/C/g"]
subs = ";".join(subs)

## impute pseudo-haplo information to avoid mismatch at hetero sites
## the read data with hetero sites is put back into clustered data later.
## pipe passed data from gunzip to sed.
cmd1 = ["gunzip", "-c", groupcons] cmd2 = ["sed", subs] LOGGER.debug(" ".join(cmd1)) LOGGER.debug(" ".join(cmd2)) proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True) allhaps = groupcons.replace("-catcons.gz", "-cathaps.fa") with open(allhaps, 'w') as output: proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=output, close_fds=True) proc2.communicate() proc1.stdout.close() ## now sort the file using vsearch allsort = groupcons.replace("-catcons.gz", "-catsort.fa") cmd1 = [ipyrad.bins.vsearch, "--sortbylength", allhaps, "--fasta_width", "0", "--output", allsort] LOGGER.debug(" ".join(cmd1)) proc1 = sps.Popen(cmd1, close_fds=True) proc1.communicate() # In[28]: data.dirs.across # In[8]: # calculate the theading of cluster1 jobs: ncpus = len(self.ipyclient.ids) njobs = len(self.cgroups) nnodes = len(self.hostd) minthreads = 2 maxthreads = 10 # In[9]: # product targets = [0, 1] self.thview = self.ipyclient.load_balanced_view(targets=targets) # In[10]: s.prepare_concats() # In[31]: cluster1(data, 2, 4) # In[30]: def cluster1(data, jobid, nthreads): # get files for this jobid catshuf = os.path.join( data.dirs.across, "{}-{}-catshuf.fa".format(data.name, jobid)) uhaplos = os.path.join( data.dirs.across, "{}-{}.utemp".format(data.name, jobid)) hhaplos = os.path.join( data.dirs.across, "{}-{}.htemp".format(data.name, jobid)) ## parameters that vary by datatype ## (too low of cov values yield too many poor alignments) strand = "plus" cov = 0.75 # 0.90 if data.paramsdict["datatype"] in ["gbs", "2brad"]: strand = "both" cov = 0.60 elif data.paramsdict["datatype"] == "pairgbs": strand = "both" cov = 0.75 # 0.90 ## nthreads is calculated in 'call_cluster()' cmd = [ipyrad.bins.vsearch, "-cluster_smallmem", catshuf, "-strand", strand, "-query_cov", str(cov), "-minsl", str(0.5), "-id", str(data.paramsdict["clust_threshold"]), "-userout", uhaplos, "-notmatched", hhaplos, "-userfields", "query+target+qstrand", "-maxaccepts", "1", "-maxrejects", "0", "-fasta_width", "0", "-threads", str(nthreads), # "0", "-fulldp", "-usersort", ] proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE) out = proc.communicate() if proc.returncode: raise IPyradError(out) # In[12]: if len(self.samples) < 50: nclusters = 1 elif len(self.samples) < 100: nclusters = 2 elif len(self.samples) < 200: nclusters = 4 elif len(self.samples) > 500: nclusters = 10 else: nclusters = int(len(self.samples) / 10) # In[ ]: # In[10]: self.cgroups # In[48]: hosts # In[ ]: