#!/usr/bin/env python # coding: utf-8 # In[6]: import ipyrad as ip import ipyparallel as ipp # In[7]: ipyclient = ipp.Client() # In[3]: data = ip.Assembly("1-pairtest") data.set_params("raw_fastq_path", "ipsimdata/pairddrad_example_R*_.fastq.gz") data.set_params("barcodes_path", "ipsimdata/pairddrad_example_barcodes.txt") data.set_params("datatype", "pairddrad") data.set_params("restriction_overhang", ("TGCAG", "CGG")) # In[3]: data = ip.Assembly("2-setest") data.set_params("raw_fastq_path", "ipsimdata/rad_example_R1_.fastq.gz") data.set_params("barcodes_path", "ipsimdata/rad_example_barcodes.txt") data.set_params("datatype", "rad") data.set_params("restriction_overhang", ("TGCAG", "")) # In[3]: data = ip.Assembly("3-refsetest") data.set_params("raw_fastq_path", "ipsimdata/rad_example_R1_.fastq.gz") data.set_params("barcodes_path", "ipsimdata/rad_example_barcodes.txt") data.set_params("datatype", "rad") data.set_params("assembly_method", "reference") data.set_params("reference_sequence", "ipsimdata/rad_example_genome.fa") data.set_params("restriction_overhang", ("TGCAG", "")) #data.get_params() # In[5]: data = ip.Assembly("4-refpairtest") data.set_params("raw_fastq_path", "ipsimdata/pairddrad_example_R*_.fastq.gz") data.set_params("barcodes_path", "ipsimdata/rad_example_barcodes.txt") data.set_params("datatype", "pairddrad") data.set_params("assembly_method", "reference") data.set_params("reference_sequence", "ipsimdata/pairddrad_example_genome.fa") data.set_params("restriction_overhang", ("TGCAG", "CGG")) #data.get_params() # In[8]: data = ip.Assembly("5-tortas") data.set_params("project_dir", "tortas") data.set_params("sorted_fastq_path", "/home/deren/Dropbox/Maud/fastq-concats/*.gz") data.set_params("datatype", "pairddrad") data.set_params("assembly_method", "reference") data.set_params("reference_sequence", "/home/deren/Dropbox/Maud/lgeorge.genome.fa") data.set_params("restriction_overhang", ("CATG", "AATT")) data.set_params("filter_adapters", 2) # In[4]: data.run("12", force=True) # In[11]: #data.run("12", force=True) #data = ip.load_json("1-pairtest.json") #data = ip.load_json("2-setest.json") #data = ip.load_json("3-refsetest.json") #data = ip.load_json("4-refpairtest.json") data = ip.load_json("tortas/5-tortas.json") # In[12]: # In[9]: from ipyrad.assemble.cluster_across import * # In[10]: from ipyrad.assemble.clustmap import * s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient) samples = list(s3.data.samples.values()) sample = samples[1] # In[7]: s3.data.ipcluster["threads"] = 4 s3.run() # In[11]: s3.data.run("45") # In[16]: #s3.data.stats data = s3.data jobid = 0 samples = list(data.samples.values())[:4] randomseed = 123 # In[ ]: # In[19]: conshandles = [ sample.files.consens[0] for sample in samples if sample.stats.reads_consens] conshandles.sort() assert conshandles, "no consensus files found" conshandles # In[20]: ## concatenate all of the gzipped consens files cmd = ['cat'] + conshandles groupcons = os.path.join( data.dirs.across, "{}-{}-catcons.gz".format(data.name, jobid)) LOGGER.debug(" ".join(cmd)) with open(groupcons, 'w') as output: call = sps.Popen(cmd, stdout=output, close_fds=True) call.communicate() ## a string of sed substitutions for temporarily replacing hetero sites ## skips lines with '>', so it doesn't affect taxon names subs = ["/>/!s/W/A/g", "/>/!s/w/A/g", "/>/!s/R/A/g", "/>/!s/r/A/g", "/>/!s/M/A/g", "/>/!s/m/A/g", "/>/!s/K/T/g", "/>/!s/k/T/g", "/>/!s/S/C/g", "/>/!s/s/C/g", "/>/!s/Y/C/g", "/>/!s/y/C/g"] subs = ";".join(subs) # In[27]: ## 
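# A minimal pure-Python sketch of what the `subs` sed script above encodes:
# IUPAC heterozygote codes are temporarily collapsed to a single concrete
# base on sequence lines, while '>' header lines are left untouched. The
# helper name `mask_hetero_sites` is illustrative only, not ipyrad API.
_HETERO_MAP = str.maketrans({
    "W": "A", "w": "A", "R": "A", "r": "A", "M": "A", "m": "A",
    "K": "T", "k": "T",
    "S": "C", "s": "C", "Y": "C", "y": "C",
})

def mask_hetero_sites(fasta_text):
    "Collapse ambiguity codes on sequence lines; skip FASTA headers."
    out = []
    for line in fasta_text.splitlines():
        out.append(line if line.startswith(">") else line.translate(_HETERO_MAP))
    return "\n".join(out)

# example: the R (A/G) and Y (C/T) sites become A and C respectively
print(mask_hetero_sites(">sampleR1_0\nTGCAGRYTTW"))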
pipe passed data from gunzip to sed. cmd1 = ["gunzip", "-c", groupcons] cmd2 = ["sed", subs] LOGGER.debug(" ".join(cmd1)) LOGGER.debug(" ".join(cmd2)) proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True) allhaps = groupcons.replace("-catcons.gz", "-cathaps.gz") with open(allhaps, 'w') as output: proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=output, close_fds=True) proc2.communicate() proc1.stdout.close() # In[30]: data.dirs.across = os.path.join(data.name + "_across") if not os.path.exists(data.dirs.across): os.makedirs(data.dirs.across) import ipyrad # In[31]: conshandles = [ sample.files.consens[0] for sample in samples if sample.stats.reads_consens] conshandles.sort() assert conshandles, "no consensus files found" ## concatenate all of the gzipped consens files cmd = ['cat'] + conshandles groupcons = os.path.join( data.dirs.across, "{}-{}-catcons.gz".format(data.name, jobid)) LOGGER.debug(" ".join(cmd)) with open(groupcons, 'w') as output: call = sps.Popen(cmd, stdout=output, close_fds=True) call.communicate() ## a string of sed substitutions for temporarily replacing hetero sites ## skips lines with '>', so it doesn't affect taxon names subs = ["/>/!s/W/A/g", "/>/!s/w/A/g", "/>/!s/R/A/g", "/>/!s/r/A/g", "/>/!s/M/A/g", "/>/!s/m/A/g", "/>/!s/K/T/g", "/>/!s/k/T/g", "/>/!s/S/C/g", "/>/!s/s/C/g", "/>/!s/Y/C/g", "/>/!s/y/C/g"] subs = ";".join(subs) ## impute pseudo-haplo information to avoid mismatch at hetero sites ## the read data with hetero sites is put back into clustered data later. ## pipe passed data from gunzip to sed. cmd1 = ["gunzip", "-c", groupcons] cmd2 = ["sed", subs] LOGGER.debug(" ".join(cmd1)) LOGGER.debug(" ".join(cmd2)) proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True) allhaps = groupcons.replace("-catcons.gz", "-cathaps.gz") with open(allhaps, 'w') as output: proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=output, close_fds=True) proc2.communicate() proc1.stdout.close() ## now sort the file using vsearch allsort = groupcons.replace("-catcons.gz", "-catsort.fa") cmd1 = [ipyrad.bins.vsearch, "--sortbylength", allhaps, "--fasta_width", "0", "--output", allsort] LOGGER.debug(" ".join(cmd1)) proc1 = sps.Popen(cmd1, close_fds=True) proc1.communicate() # In[34]: from ipyrad.assemble.cluster_across import * random.seed(randomseed) ## open an iterator to lengthsorted file and grab two lines at at time allshuf = groupcons.replace("-catcons.gz", "-catshuf.fa") outdat = open(allshuf, 'wt') indat = open(allsort, 'r') idat = izip(iter(indat), iter(indat)) done = 0 chunk = [next(idat)] while not done: ## grab 2-lines until they become shorter (unless there's only one) oldlen = len(chunk[-1][-1]) while 1: try: dat = next(idat) except StopIteration: done = 1 break if len(dat[-1]) == oldlen: chunk.append(dat) else: ## send the last chunk off to be processed random.shuffle(chunk) outdat.write("".join(chain(*chunk))) ## start new chunk chunk = [dat] break ## do the last chunk random.shuffle(chunk) outdat.write("".join(chain(*chunk))) indat.close() outdat.close() # In[50]: def build_clusters_from_cigars(data, sample): # get all regions with reads. 
Generator to yield (str, int, int) fullregions = bedtools_merge(data, sample).strip().split("\n") regions = (i.split("\t") for i in fullregions) regions = ((i, int(j), int(k)) for (i, j, k) in regions) # access reads from bam file using pysam bamfile = AlignmentFile( os.path.join(data.dirs.refmapping, "{}-mapped-sorted.bam".format(sample.name)), 'rb') # iterate over all regions opath = os.path.join( data.dirs.clusts, "{}.clustS.gz".format(sample.name)) out = gzip.open(opath, 'wt') idx = 0 clusters = [] for reg in regions: # uncomment and compare against ref sequence when testing #ref = get_ref_region(data.paramsdict["reference_sequence"], *reg) reads = bamfile.fetch(*reg) # store reads in a dict rdict = {} # paired-end data cluster building if "pair" in data.paramsdict["datatype"]: # match paired reads together in a dictionary for read in reads: if read.qname not in rdict: rdict[read.qname] = [read, None] else: rdict[read.qname][1] = read # sort keys by derep number keys = sorted( rdict.keys(), key=lambda x: int(x.split("=")[-1]), reverse=True) # build the cluster based on map positions, orientation, cigar clust = [] for key in keys: r1, r2 = rdict[key] if r1 and r2: #lref = len(ref[1]) lref = reg[2] - reg[1] arr1 = np.zeros(lref, dtype="U1") arr2 = np.zeros(lref, dtype="U1") arr1.fill("-") arr2.fill("-") # how far ahead of the start does this read begin seq = cigared(r1.seq, r1.cigar) start = r1.reference_start - reg[1] arr1[start:start + len(seq)] = list(seq) seq = cigared(r2.seq, r2.cigar) start = r2.reference_start - reg[1] arr2[start:start + len(seq)] = list(seq) arr3 = join_arrays(arr1, arr2) pairseq = "".join(arr3) ori = "+" if r1.is_reverse: ori = "-" derep = r1.qname.split("=")[-1] rname = "{}:{}-{};size={};{}".format(*reg, derep, ori) clust.append("{}\n{}".format(rname, pairseq)) # single-end data cluster building else: for read in reads: rdict[read.qname] = read # sort keys by derep number keys = sorted( rdict.keys(), key=lambda x: int(x.split("=")[-1]), reverse=True) # build the cluster based on map positions, orientation, cigar clust = [] for key in keys: r1 = rdict[key] #aref = np.array(list(ref[1])) lref = reg[2] - reg[1] arr1 = np.zeros(lref, dtype="U1") arr1.fill("-") # how far ahead of the start does this read begin seq = cigared(r1.seq, r1.cigar) start = r1.reference_start - reg[1] arr1[start:start + len(seq)] = list(seq) aseq = "".join(arr1) ori = "+" if r1.is_reverse: ori = "-" derep = r1.qname.split("=")[-1] rname = "{}:{}-{};size={};{}".format(*reg, derep, ori) clust.append("{}\n{}".format(rname, aseq)) # store this cluster clusters.append("\n".join(clust)) idx += 1 # if 1000 clusters stored then write to disk if not idx % 10: out.write("\n//\n//\n".join(clusters) + "\n//\n//\n") clusters = [] # write final remaining clusters to disk out.write("\n//\n//\n".join(clusters) + "\n//\n//\n") out.close() # In[51]: build_clusters_from_cigars(data, sample) # In[1]: #maxlens, depths = get_quick_depths(data, sample) # In[2]: #sample_cleanup(data, sample) # In[18]: def get_quick_depths(data, sample): """ iterate over clustS files to get data """ ## use existing sample cluster path if it exists, since this ## func can be used in step 4 and that can occur after merging ## assemblies after step3, and if we then referenced by data.dirs.clusts ## the path would be broken. 
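# Sketch of the two-lines-at-a-time iteration used in get_quick_depths()
# below: the clustS format stores each read as a (name, sequence) pair and
# separates clusters with a pair of '//' lines, so when the "name" and the
# "sequence" strip to the same token we know a cluster just ended.
# This toy example runs on an in-memory chunk; the names are made up.
import io

toy = io.StringIO(
    ">Contig0:100-230;size=5;+\nTGCAGAA\n"
    ">Contig0:100-230;size=2;+\nTGCAGAT\n"
    "//\n//\n"
    ">Contig0:500-640;size=9;+\nTGCAGCC\n"
    "//\n//\n"
)
pairs = zip(*[iter(toy)] * 2)          # same pattern as izip(*[iter(f)] * 2)
depth = 0
for name, seq in pairs:
    if name.strip() == seq.strip():    # hit the '//' separator pair
        print("cluster done, summed depth =", depth)
        depth = 0
    else:
        depth += int(name.split(";size=")[1].split(";")[0])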
if sample.files.clusters: pass else: ## set cluster file handles sample.files.clusters = os.path.join( data.dirs.clusts, sample.name + ".clustS.gz") ## get new clustered loci fclust = data.samples[sample.name].files.clusters clusters = gzip.open(fclust, 'rt') pairdealer = izip(*[iter(clusters)] * 2) ## storage depths = [] maxlen = [] ## start with cluster 0 tdepth = 0 tlen = 0 ## iterate until empty while 1: ## grab next try: name, seq = next(pairdealer) except StopIteration: break ## if not the end of a cluster #print name.strip(), seq.strip() if name.strip() == seq.strip(): depths.append(tdepth) maxlen.append(tlen) tlen = 0 tdepth = 0 else: try: tdepth += int(name.strip().split("=")[-1][:-2]) tlen = len(seq) except: print(name) ## return clusters.close() return np.array(maxlen), np.array(depths) # In[14]: def sample_cleanup(data, sample): """ stats, cleanup, and link to samples """ ## get maxlen and depths array from clusters maxlens, depths = get_quick_depths(data, sample) ## Test if depths is non-empty, but just full of zeros. if not depths.max(): print(" no clusters found for {}".format(sample.name)) return else: ## store which min was used to calculate hidepth here sample.stats_dfs.s3["hidepth_min"] = ( data.paramsdict["mindepth_majrule"]) # If our longest sequence is longer than the current max_fragment_len # then update max_fragment_length. For assurance we require that # max len is 4 greater than maxlen, to allow for pair separators. hidepths = depths >= data.paramsdict["mindepth_majrule"] maxlens = maxlens[hidepths] ## Handle the case where there are no hidepth clusters if maxlens.any(): maxlen = int(maxlens.mean() + (2. * maxlens.std())) else: maxlen = 0 if maxlen > data._hackersonly["max_fragment_length"]: data._hackersonly["max_fragment_length"] = maxlen + 4 ## make sense of stats keepmj = depths[depths >= data.paramsdict["mindepth_majrule"]] keepstat = depths[depths >= data.paramsdict["mindepth_statistical"]] ## sample summary stat assignments sample.stats["state"] = 3 sample.stats["clusters_total"] = depths.shape[0] sample.stats["clusters_hidepth"] = keepmj.shape[0] ## store depths histogram as a dict. Limit to first 25 bins bars, bins = np.histogram(depths, bins=range(1, 26)) sample.depths = {int(i): int(v) for i, v in zip(bins, bars) if v} ## sample stat assignments ## Trap numpy warnings ("mean of empty slice") printed by samples ## with few reads. with warnings.catch_warnings(): warnings.simplefilter("ignore", category=RuntimeWarning) sample.stats_dfs.s3["merged_pairs"] = sample.stats.reads_merged sample.stats_dfs.s3["clusters_total"] = depths.shape[0] try: sample.stats_dfs.s3["clusters_hidepth"] = ( int(sample.stats["clusters_hidepth"])) except ValueError: ## Handle clusters_hidepth == NaN sample.stats_dfs.s3["clusters_hidepth"] = 0 sample.stats_dfs.s3["avg_depth_total"] = depths.mean() sample.stats_dfs.s3["avg_depth_mj"] = keepmj.mean() sample.stats_dfs.s3["avg_depth_stat"] = keepstat.mean() sample.stats_dfs.s3["sd_depth_total"] = depths.std() sample.stats_dfs.s3["sd_depth_mj"] = keepmj.std() sample.stats_dfs.s3["sd_depth_stat"] = keepstat.std() ## Get some stats from the bam files ## This is moderately hackish. samtools flagstat returns ## the number of reads in the bam file as the first element ## of the first line, this call makes this assumption. 
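# Sketch of the `samtools flagstat` parsing relied on just below: the first
# whitespace-separated token of the first output line is taken as the read
# count, and for paired data it is halved so R1+R2 count as one read pair.
# The flagstat text here is a hypothetical example, not captured output.
flagstat_text = "842 + 0 in total (QC-passed reads + QC-failed reads)\n842 + 0 primary\n"

def reads_from_flagstat(text, paired=False):
    "Return read (pair) count from flagstat-style output; assumes the layout above."
    nreads = int(text.split()[0])
    return nreads // 2 if paired else nreads

print(reads_from_flagstat(flagstat_text, paired=True))   # -> 421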
if not data.paramsdict["assembly_method"] == "denovo": ## shorter names mapf = os.path.join( data.dirs.refmapping, sample.name + "-mapped-sorted.bam") umapf = os.path.join( data.dirs.refmapping, sample.name + "-unmapped.bam") ## get from unmapped cmd1 = [ip.bins.samtools, "flagstat", umapf] proc1 = sps.Popen(cmd1, stderr=sps.STDOUT, stdout=sps.PIPE) result1 = proc1.communicate()[0] ## get from mapped cmd2 = [ip.bins.samtools, "flagstat", mapf] proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE) result2 = proc2.communicate()[0] ## store results ## If PE, samtools reports the _actual_ number of reads mapped, both ## R1 and R2, so here if PE divide the results by 2 to stay consistent ## with how we've been reporting R1 and R2 as one "read pair" if "pair" in data.paramsdict["datatype"]: sample.stats["refseq_unmapped_reads"] = int(result1.split()[0]) / 2 sample.stats["refseq_mapped_reads"] = int(result2.split()[0]) / 2 else: sample.stats["refseq_unmapped_reads"] = int(result1.split()[0]) sample.stats["refseq_mapped_reads"] = int(result2.split()[0]) unmapped = os.path.join(data.dirs.refmapping, sample.name + "-unmapped.bam") samplesam = os.path.join(data.dirs.refmapping, sample.name + ".sam") for rfile in [unmapped, samplesam]: if os.path.exists(rfile): os.remove(rfile) # if loglevel==DEBUG log_level = ip.logger.getEffectiveLevel() if not log_level == 10: ## Clean up loose files only if not in DEBUG ##- edits/*derep, utemp, *utemp.sort, *htemp, *clust.gz derepfile = os.path.join(data.dirs.edits, sample.name + "_derep.fastq") mergefile = os.path.join(data.dirs.edits, sample.name + "_merged_.fastq") uhandle = os.path.join(data.dirs.clusts, sample.name + ".utemp") usort = os.path.join(data.dirs.clusts, sample.name + ".utemp.sort") hhandle = os.path.join(data.dirs.clusts, sample.name + ".htemp") clusters = os.path.join(data.dirs.clusts, sample.name + ".clust.gz") for rfile in [derepfile, mergefile, uhandle, usort, hhandle, clusters]: if os.path.exists(rfile): os.remove(rfile) # In[7]: # optimize speed of this next build_clusters_from_cigars(data, sample) # ### Consensus for refmapped data -- store all (even long pairs)? # In[ ]: ### Write each out to a sam file... # newconsensus() # data._este = data.stats.error_est.mean() data._esth = data.stats.hetero_est.mean() clusters = open(os.path.join(data.dirs.clusts, "{}.clustS.gz".format(sample.name)), 'r') clusters.read() # plan to fill an h5 for this sample tmp5 = consenshandle.replace("_tmpcons.", "_tmpcats.") with h5py.File(tmp5, 'w') as io5: io5.create_dataset("cats", (optim, maxlen, 4), dtype=np.uint32) io5.create_dataset("alls", (optim, ), dtype=np.uint8) io5.create_dataset("chroms", (optim, 3), dtype=np.int64) ## local copies to use to fill the arrays catarr = io5["cats"][:] nallel = io5["alls"][:] refarr = io5["chroms"][:] # In[ ]: ### Step 6 for refmapped pairs: # 1. convert all sams to bams and make a merged mapped-sorted.bam # 2. get overlapping regions with bedtools_merge() # 3. pull consens reads in aligned regions with bamfile.fetch() # 4. store the consensus sequence in h5. # 5. store the variants in h5. # 6. store the depth of variants in h5. # 7. # In[7]: # get all regions with reads. 
Generator to yield (str, int, int) fullregions = bedtools_merge(data, sample).strip().split("\n") regions = (i.split("\t") for i in fullregions) regions = ((i, int(j), int(k)) for (i, j, k) in regions) # access reads from bam file using pysam samfile = AlignmentFile( os.path.join(data.dirs.refmapping, "{}-mapped-sorted.bam".format(sample.name)), 'rb') # In[34]: reg = next(regions) ref = get_ref_region(data.paramsdict["reference_sequence"], *reg) reads = samfile.fetch(*reg) # In[35]: # match paired reads together in a dictionary rdict = {} for read in reads: rdict[read.qname] = read # sort keys by derep number keys = sorted( rdict.keys(), key=lambda x: int(x.split("=")[-1]), reverse=True) # build the cluster based on map positions, orientation, cigar clust = [] for key in keys: r1 = rdict[key] aref = np.array(list(ref[1])) arr1 = np.zeros(aref.size, dtype="U1") arr1.fill("-") # how far ahead of the start does this read begin seq = cigared(r1.seq, r1.cigar) start = r1.reference_start - reg[1] arr1[start:start + len(seq)] = list(seq) aseq = "".join(arr1) ori = "+" if r1.is_reverse: ori = "-" derep = r1.qname.split("=")[-1] rname = "{}:{}-{};size={};{}".format(*reg, derep, ori) clust.append("{}\n{}".format(rname, aseq)) # In[36]: clust # In[30]: rdict # In[16]: mapping_reads(data, sample, 2) # In[ ]: # In[9]: s3.data.run("5") # In[11]: subsamples = list(s3.data.samples.values()) subsamples.sort(key=lambda x: x.stats.clusters_hidepth, reverse=True) jobs = {} # In[8]: from ipyrad.assemble.jointestimate import * optim(data, sample) # In[9]: sample, _, _, nhidepth, maxlen = recal_hidepth(data, sample) # In[6]: data concat_multiple_edits(data, sample) merge_pairs_with_vsearch(data, sample, True) merge_end_to_end(data, sample, True, True) # In[7]: dereplicate(data, sample, 2) # In[8]: cluster(data, sample, 2, 1) # In[9]: build_clusters(data, sample, 5) # In[10]: muscle_chunker(data, sample) # In[14]: for idx in range(10): handle = os.path.join(s3.data.tmpdir, "{}_chunk_{}.ali".format(sample.name, idx)) align_and_parse(handle, 5, 0) # In[15]: reconcat(data, sample) # In[ ]: # In[6]: s3.run() # In[6]: # In[9]: maxlens, depths = get_quick_depths(data, sample) maxlens, depths # In[ ]: # In[7]: data = s3.data samples = list(s3.data.samples.values()) sample = samples[1] # In[8]: sample = samples[1] # In[9]: concat_multiple_edits(data, sample) # In[10]: merge_end_to_end(data, sample, False) # In[11]: dereplicate(data, sample, 2) # In[12]: split_endtoend_reads(data, sample) # In[13]: mapping_reads(data, sample, 2) # In[14]: build_ref_cigars(data, sample) # In[ ]: # In[ ]: #samtools mpileup -uf ref.fa aln1.bam aln2.bam | bcftools view -bvcg - > var.raw.bcf get_ipython().system(' samtools mpileup -ur /home/deren/Dropbox/opbox/Maud/lgeorge.genome.fa') # In[9]: #regions = bedtools_merge(data, sample).strip().split("\n") fullregions = bedtools_merge(data, sample).strip().split("\n") # In[10]: regions = (i.split("\t") for i in fullregions) regions = ((i, int(j), int(k)) for (i, j, k) in regions) # In[ ]: # In[14]: def get_ref_region(reference, contig, rstart, rend): "returns the reference sequence over a given region" cmd = [ ip.bins.samtools, 'faidx', reference, "{}:{}-{}".format(contig, rstart + 1, rend), ] stdout, err = sps.Popen(cmd, stdout=sps.PIPE).communicate() name, seq = stdout.decode().split("\n", 1) listseq = [name, seq.replace("\n", "")] return listseq # In[527]: def build_ref_cigars(data, sample): # get all regions with reads. 
Generator to yield (str, int, int) #fullregions = bedtools_merge(data, sample).strip().split("\n") regions = (i.split("\t") for i in fullregions) regions = ((i, int(j), int(k)) for (i, j, k) in regions) # access reads from bam file using pysam samfile = AlignmentFile( os.path.join(data.dirs.refmapping, "{}-mapped-sorted.bam".format(sample.name)), 'rb') # iterate over all regions out = open("test.clustS", 'w') idx = 0 clusters = [] for reg in regions: ref = get_ref_region(data.paramsdict["reference_sequence"], *reg) reads = samfile.fetch(*reg) # match paired reads together in a dictionary rdict = {} for read in reads: if read.qname not in rdict: rdict[read.qname] = [read, None] else: rdict[read.qname][1] = read # sort keys by derep number keys = sorted( rdict.keys(), key=lambda x: int(x.split("=")[-1]), reverse=True) # build the cluster based on map positions, orientation, cigar clust = [] for key in keys: r1, r2 = rdict[key] if r1 and r2: aref = np.array(list(ref[1])) arr1 = np.zeros(aref.size, dtype="U1") arr2 = np.zeros(aref.size, dtype="U1") arr1.fill("-") arr2.fill("-") try: # how far ahead of the start does this read begin seq = cigared(r1.seq, r1.cigar) start = r1.reference_start - reg[1] arr1[start:start + len(seq)] = list(seq) seq = cigared(r2.seq, r2.cigar) start = r2.reference_start - reg[1] arr2[start:start + len(seq)] = list(seq) arr3 = join_arrays(arr1, arr2) pairseq = "".join(arr3) derep = r1.qname.split("=")[-1] clust.append("{}\n{}".format("{}:{}-{};size={}" .format(*reg, derep), pairseq)) except ValueError: print(reg) clusters.append("\n".join(clust)) idx += 1 if not idx % 1000: out.write("\n//\n//\n".join(clusters)) out.close() # In[549]: def build_ref_cigars(data, sample): # get all regions with reads. Generator to yield (str, int, int) #fullregions = bedtools_merge(data, sample).strip().split("\n") regions = (i.split("\t") for i in fullregions) regions = ((i, int(j), int(k)) for (i, j, k) in regions) # access reads from bam file using pysam samfile = AlignmentFile( os.path.join(data.dirs.refmapping, "{}-mapped-sorted.bam".format(sample.name)), 'rb') # iterate over all regions opath = os.path.join( data.dirs.refmapping, "{}.clustS.gz".format(sample.name)) out = gzip.open(opath, 'w') idx = 0 clusters = [] for reg in regions: ref = get_ref_region(data.paramsdict["reference_sequence"], *reg) reads = samfile.fetch(*reg) # match paired reads together in a dictionary rdict = {} for read in reads: if read.qname not in rdict: rdict[read.qname] = [read, None] else: rdict[read.qname][1] = read # sort keys by derep number keys = sorted( rdict.keys(), key=lambda x: int(x.split("=")[-1]), reverse=True) # build the cluster based on map positions, orientation, cigar clust = [] for key in keys: r1, r2 = rdict[key] if r1 and r2: aref = np.array(list(ref[1])) arr1 = np.zeros(aref.size, dtype="U1") arr2 = np.zeros(aref.size, dtype="U1") arr1.fill("-") arr2.fill("-") # how far ahead of the start does this read begin seq = cigared(r1.seq, r1.cigar) start = r1.reference_start - reg[1] arr1[start:start + len(seq)] = list(seq) seq = cigared(r2.seq, r2.cigar) start = r2.reference_start - reg[1] arr2[start:start + len(seq)] = list(seq) arr3 = join_arrays(arr1, arr2) pairseq = "".join(arr3) derep = r1.qname.split("=")[-1] rname = "{}:{}-{};size={}".format(*reg, derep) clust.append("{}\n{}".format(rname, pairseq)) clusters.append("\n".join(clust)) idx += 1 if not idx % 100: out.write("\n//\n//\n".join(clusters).encode()) out.close() # In[550]: c = build_ref_cigars(data, sample) # In[547]: 
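# Side note on the two gzip-writing styles used in the cluster builders
# above: gzip.open(..., 'w') is a binary handle and needs bytes (hence the
# .encode() calls), while gzip.open(..., 'wt') accepts str directly. A tiny
# self-contained check that both views round-trip the same payload; the
# file name is a throwaway used only for this demo.
import gzip, os, tempfile

demo_clusters = [">a;size=2;+\nTGCAG", ">b;size=1;+\nTGCAT"]
payload = "\n//\n//\n".join(demo_clusters) + "\n//\n//\n"

tmp = os.path.join(tempfile.gettempdir(), "clust-demo.gz")
with gzip.open(tmp, 'w') as out:       # binary mode: bytes required
    out.write(payload.encode())
with gzip.open(tmp, 'rt') as infile:   # text mode: str comes back out
    assert infile.read() == payload
os.remove(tmp)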
"\n//\n//\n".join(c).encode() # In[397]: regions = (i.split("\t") for i in fullregions) regions = ((i, int(j), int(k)) for (i, j, k) in regions) build_ref_cigars(data, sample) # In[438]: reg = ('Contig0', 41920, 42479) reg = ('Contig0', 7219468, 7220326) reg = ('Contig0', 207998, 208219) reg = ('Contig0', 16738, 16933) reg = ('Contig0', 49781, 50005) reg = ('Contig0', 76480, 76711) reg = ('Contig0', 24391, 24908) reg = ('Contig0', 7193, 7952) # has merge #reg = ('Contig0', 13327, 13845) # has indels reg = ('Contig0', 76480, 77015) # has indels reg = ('Contig0', 346131, 346193) # valueerror samfile = AlignmentFile( os.path.join(data.dirs.refmapping, "{}-mapped-sorted.bam".format(sample.name)), 'rb') ref = get_ref_region(data.paramsdict["reference_sequence"], *reg) reads = list(samfile.fetch(*reg)) # In[519]: def cigared(sequence, cigartups): start = 0 seq = "" for tup in cigartups: flag, add = tup if flag is 0: seq += sequence[start:start + add] if flag is 1: pass if flag is 2: seq += "-" * add start -= add if flag is 4: pass start += add return seq # In[456]: # match paired reads together in a dictionary rdict = {} for read in reads: if read.qname not in rdict: rdict[read.qname] = [read] else: rdict[read.qname].append(read) # sort keys by derep number keys = sorted( rdict.keys(), key=lambda x: int(x.split("=")[-1]), reverse=True) # build the cluster based on map positions, orientation, cigar for key in keys: r1, r2 = rdict[key] if r1 and r2: print(r1.seq, r1.cigar)# r1.pos, r1.aend, r1.rlen, r1.aligned_pairs) print(cigared(r1.seq, r1.cigar)) # In[310]: out = open("test2.txt", 'w') # build the cluster based on map positions, orientation, cigar for key in keys: r1, r2 = rdict[key] if r1 and r2: # empty arrays aref = np.array(list(ref[1])) arr1 = np.zeros(aref.size, dtype="U1") arr2 = np.zeros(aref.size, dtype="U1") arr1.fill("-") arr2.fill("-") # how far ahead of the start does this read begin start = r1.reference_start - reg[1] seq = cigared(r1.seq, r1.cigar) arr1[start:start+len(seq)] = list(seq) seq = cigared(r2.seq, r2.cigar) start = r2.reference_start - reg[1] arr2[start:start+len(seq)] = list(seq) #print("".join(arr1), file=out) #print("".join(arr2), file=out) arr3 = join_arrays(arr1, arr2) print("".join(arr3), file=out) out.close() # In[307]: import numba #numba.jit(nopython=True) def join_arrays(arr1, arr2): arr3 = np.zeros(arr1.size, dtype="U1") for i in range(arr1.size): if arr1[i] == arr2[i]: arr3[i] = arr1[i] elif arr1[i] == "N": if arr2[i] == "-": arr3[i] = "N" else: arr3[i] = arr2[i] elif arr2[i] == "N": if arr1[i] == "-": arr3[i] = "N" else: arr3[i] = arr1[i] elif arr1[i] == "-": if arr2[i] == "N": arr3[i] = "N" else: arr3[i] = arr2[i] elif arr2[i] == "-": if arr1[i] == "N": arr3[i] = "N" else: arr3[i] = arr1[i] else: arr3[i] = "N" return arr3 a1 = np.array(list("AGAGAG-NN----")) a2 = np.array(list("----ACTTNNTTT")) de = np.array(list("AGAGANTTNNTTT")) a1 = np.array(list("AGAGAG-NN-------")) a2 = np.array(list("----ACTTNNTTT---")) de = np.array(list("AGAGANTTNNTTT---")) # In[308]: join_arrays(a1, a2) # In[233]: # how far ahead of the start does this read begin start = r1.reference_start - reg[1] # In[228]: aref = np.array(list(ref[1])) arr1 = np.zeros(aref.size, dtype="U1") arr2 = np.zeros(aref.size, dtype="U1") arr1.fill("-") # In[194]: r1.reference_start, r1.pos, r1.reference_end, r1.cigar # In[202]: cigared(r1.seq, r1.cigar) # In[199]: # which is forward and reverse if r1.blocks[0][0] > r2.blocks[0][0]: rx = r2 r2 = r1 r1 = rx # get arrs aref = np.array(list(ref[1])) arr1 = 
np.zeros(aref.size, dtype="U1") arr2 = np.zeros(aref.size, dtype="U1") # fill arr1 seq1 = cigared(r1.seq, r1.cigar) arr1[] # In[197]: ref = get_ref_region(data.paramsdict["reference"], *reg) def cigar(r1, r2, reg): # which is forward and reverse if r1.blocks[0][0] > r2.blocks[0][0]: rx = r2 r2 = r1 r1 = rx # get arrs aref = np.array(list(ref[1])) arr1 = np.zeros(aref.size, dtype="U1") arr2 = np.zeros(aref.size, dtype="U1") # fill arr1 # do they overlap? overlap = False if max(r1.blocks[0]) > min(r2.blocks[0]): overlap = True #osegment = r1.blocks[0][1] - r2.blocks[0][0] #oseqs = r1.seq[-osegment:], r2.seq[:osegment] #print(oseqs) # modify for cigar for edit in r1.cigar: r1.seq before = "-" * (r1.pos - reg[1]) if r1.is_reverse: read = before + revcomp(r1.seq) else: read = before + r1.seq midns = "-" * (r2.pos - r1.aend) read += midns if r2.is_reverse: read += r2.seq else: read += revcomp(r2.seq) after = "-" * (reg[2] - r2.aend) read += after return read # In[ ]: # In[ ]: # In[ ]: # In[350]: def get_ref_region(reference, contig, rstart, rend): cmd = [ ip.bins.samtools, 'faidx', reference, "{}:{}-{}".format(contig, rstart+1, rend)] stdout, err = sps.Popen(cmd, stdout=sps.PIPE).communicate() name, seq = stdout.decode().split("\n", 1) listseq = [name, seq.replace("\n", "")] return listseq # In[250]: ireg = samfile.fetch(*reg) rdict = {}fo for read in ireg: if read.qname not in rdict: rdict[read.qname] = [read] else: rdict[read.qname].append(read) clust = dict_to_clust(rdict) # In[251]: clust # In[252]: ref = get_ref_region(*reg) ref # In[255]: print(ref[1][:80]) print(r1.seq) # In[272]: # how for ahead of reference is r1 start ahead = -1 * (reg[1] - (r1.pos - 1)) # In[274]: print(ref[1][:90]) print("-"*ahead, r1.seq) # In[344]: print('r', ref[1][:90]) for key in rdict: r1, r2 = rdict[key] ahead = -1 * (reg[1] - (r1.pos)) print('h', ("-"*ahead + r1.seq)[:90]) #print(r1.cigar) print('r', ref[1][-90:]) for key in rdict: r1, r2 = rdict[key] ahead = (reg[2] - (r2.pos)) print(reg[2], r2.pos, ahead) print('h', ("-"*ahead)) #print(r1.cigar) #print(r2.seq) #print(r2.get_blocks()) #print(r1.seq) #print(r1.get_blocks()[0]) #print(r1.aend, r2.get_blocks()[0][0]) #print(r1.cigar) print(r2.cigarstring) # In[66]: for read in clust: print(read) # In[30]: # access reads from bam file using pysam samfile = AlignmentFile( os.path.join(data.dirs.refmapping, "{}-mapped-sorted.bam".format(sample.name)), 'rb') # iterate over all mapped regions clusters = [] for reg in regions: ireg = samfile.fetch(*reg) # match paired reads by read names for all reads in each cluster rdict = {} for read in ireg: if read.qname not in rdict: rdict[read.qname] = [read] else: rdict[read.qname].append(read) clust = dict_to_clust(rdict) # In[31]: rdict # In[29]: from ipyrad.assemble.util import revcomp # In[106]: def build_ref_clusters_from_cigars(data, sample): # get regions from bedtools overlaps regions = bedtools_merge(data, sample).strip().split("\n") regions = (i.split("\t") for i in regions) regions = ((i, int(j), int(k)) for (i, j, k) in regions) # access reads from bam file using pysam samfile = AlignmentFile( os.path.join(data.dirs.refmapping, "{}-mapped-sorted.bam".format(sample.name)), 'rb') # iterate over all mapped regions clusters = [] for reg in regions: ireg = samfile.fetch(*reg) # match paired reads by read names for all reads in each cluster rdict = {} for read in ireg: if read.qname not in rdict: rdict[read.qname] = [read] else: rdict[read.qname].append(read) return dict_to_clust(rdict) # In[109]: seq = 
build_ref_clusters_from_cigars(data, sample) seq # In[27]: def dict_to_clust(rdict): clust = [] for pair in rdict.values(): r1, r2 = pair poss = r1.get_reference_positions() + r2.get_reference_positions() seedstart, seedend = min(poss), max(poss) reads_overlap = False #print(r1.cigartuples, r1.cigarstring, r1.cigar) if r1.is_reverse: if r2.aend > r1.get_blocks()[0][0]: reads_overlap = True seq = r2.seq + 'nnnn' + revcomp(r1.seq) else: seq = r2.seq + 'nnnn' + r2.seq else: if r1.aend > r2.get_blocks()[0][0]: reads_overlap = True seq = r1.seq + 'nnnn' + revcomp(r2.seq) else: seq = r1.seq + 'nnnn' + r2.seq clust.append(seq) return clust # In[8]: # build clusters for aligning with muscle from the sorted bam file samfile = AlignmentFile( os.path.join(data.dirs.refmapping, "{}-mapped-sorted.bam".format(sample.name)), 'rb') # In[24]: chrom, rstart, rend = regions[0].split() reg = samfile.fetch(chrom, int(rstart), int(rend)) for read in reg: print(rstart) print(read.seq) # In[ ]: # In[9]: nonmerged1 = os.path.join( data.tmpdir, "{}_nonmerged_R1_.fastq".format(sample.name)) nonmerged2 = os.path.join( data.tmpdir, "{}_nonmerged_R2_.fastq".format(sample.name)) edits1 = os.path.join( data.dirs.edits, "{}.trimmed_R1_.fq.gz".format(sample.name)) edits2 = os.path.join( data.dirs.edits, "{}.trimmed_R2_.fq.gz".format(sample.name)) # file precedence nonm1 = [i for i in (edits1, nonmerged1) if os.path.exists(i)][-1] nonm2 = [i for i in (edits2, nonmerged2) if os.path.exists(i)][-1] # In[10]: edits1 # In[11]: infiles = [ os.path.join(data.dirs.edits, "{}.trimmed_R1_.fastq.gz".format(sample.name)), os.path.join(data.dirs.edits, "{}_R1_concatedit.fq.gz".format(sample.name)), os.path.join(data.tmpdir, "{}_merged.fastq".format(sample.name)), os.path.join(data.tmpdir, "{}_declone.fastq".format(sample.name)), ] infiles = [i for i in infiles if os.path.exists(i)] infile = infiles[-1] # In[15]: split_endtoend_reads(data, sample) # In[11]: with open(inp, 'r') as infile: duo = izip(*[iter(infile)] * 2) idx = 0 while 1: try: itera = next(duo) except StopIteration: break r1, r2 = itera[1].split("nnnn") # In[14]: def split_endtoend_reads(data, sample): """ Takes R1nnnnR2 derep reads from paired data and splits it back into separate R1 and R2 parts for read mapping. """ inp = os.path.join(data.tmpdir, "{}_derep.fastq".format(sample.name)) out1 = os.path.join(data.tmpdir, "{}_R1-tmp.fastq".format(sample.name)) out2 = os.path.join(data.tmpdir, "{}_R2-tmp.fastq".format(sample.name)) splitderep1 = open(out1, 'w') splitderep2 = open(out2, 'w') with open(inp, 'r') as infile: # Read in the infile two lines at a time: (seqname, sequence) duo = izip(*[iter(infile)] * 2) ## lists for storing results until ready to write split1s = [] split2s = [] ## iterate over input splitting, saving, and writing. 
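# Sketch of the split performed in the loop below: each derep record is a
# (header, "R1nnnnR2\n") pair, so splitting the sequence on the 'nnnn'
# spacer recovers the two mates. R1 needs a newline added back, while R2
# keeps the newline it inherited from the input line. Toy records only.
records = [
    ("@d1;size=3;\n", "TGCAGAAAACCC" + "nnnn" + "GGGTTTTCAGCT\n"),
    ("@d2;size=1;\n", "TGCAGAAAACCA" + "nnnn" + "GGGTTTTCAGCA\n"),
]
r1_lines, r2_lines = [], []
for header, seq in records:
    part1, part2 = seq.split("nnnn")
    r1_lines.append("{}{}\n".format(header, part1))   # re-add newline
    r2_lines.append("{}{}".format(header, part2))     # newline came along
print("".join(r1_lines))
print("".join(r2_lines))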
idx = 0 while 1: try: itera = next(duo) except StopIteration: break ## split the duo into separate parts and inc counter part1, part2 = itera[1].split("nnnn") idx += 1 ## R1 needs a newline, but R2 inherits it from the original file ## store parts in lists until ready to write split1s.append("{}{}\n".format(itera[0], part1)) split2s.append("{}{}".format(itera[0], part2)) ## if large enough then write to file if not idx % 10000: splitderep1.write("".join(split1s)) splitderep2.write("".join(split2s)) split1s = [] split2s = [] ## write final chunk if there is any if any(split1s): splitderep1.write("".join(split1s)) splitderep2.write("".join(split2s)) ## close handles splitderep1.close() splitderep2.close() # In[10]: merge_pairs(data, sample, 1, 1) # In[ ]: # In[8]: s3.data.samples["1A_0"].concatfiles # In[6]: s3.run() # In[28]: def bedtools_merge(data, sample): """ Get all contiguous genomic regions with one or more overlapping reads. This is the shell command we'll eventually run bedtools bamtobed -i 1A_0.sorted.bam | bedtools merge [-d 100] -i : specifies the input file to bed'ize -d : For PE set max distance between reads """ LOGGER.info("Entering bedtools_merge: %s", sample.name) mappedreads = os.path.join(data.dirs.refmapping, sample.name + "-mapped-sorted.bam") ## command to call `bedtools bamtobed`, and pipe output to stdout ## Usage: bedtools bamtobed [OPTIONS] -i ## Usage: bedtools merge [OPTIONS] -i cmd1 = [ip.bins.bedtools, "bamtobed", "-i", mappedreads] cmd2 = [ip.bins.bedtools, "merge", "-i", "-"] ## If PE the -d flag to tell bedtools how far apart to allow mate pairs. ## If SE the -d flag is negative, specifying that SE reads need to ## overlap by at least a specific number of bp. This prevents the ## stairstep syndrome when a + and - read are both extending from ## the same cutsite. Passing a negative number to `merge -d` gets this done. if 'pair' in data.paramsdict["datatype"]: check_insert_size(data, sample) #cmd2.insert(2, str(data._hackersonly["max_inner_mate_distance"])) cmd2.insert(2, str(data._hackersonly["max_inner_mate_distance"])) cmd2.insert(2, "-d") else: cmd2.insert(2, str(-1 * data._hackersonly["min_SE_refmap_overlap"])) cmd2.insert(2, "-d") ## pipe output from bamtobed into merge LOGGER.info("stdv: bedtools merge cmds: %s %s", cmd1, cmd2) proc1 = sps.Popen(cmd1, stderr=sps.STDOUT, stdout=sps.PIPE) proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc1.stdout) result = proc2.communicate()[0].decode() proc1.stdout.close() ## check for errors and do cleanup if proc2.returncode: raise IPyradWarningExit("error in %s: %s", cmd2, result) ## Write the bedfile out, because it's useful sometimes. if os.path.exists(ip.__debugflag__): with open(os.path.join(data.dirs.refmapping, sample.name + ".bed"), 'w') as outfile: outfile.write(result) ## Report the number of regions we're returning nregions = len(result.strip().split("\n")) LOGGER.info("bedtools_merge: Got # regions: %s", nregions) return result # In[27]: bedtools_merge # In[29]: def check_insert_size(data, sample): """ check mean insert size for this sample and update hackersonly.max_inner_mate_distance if need be. This value controls how far apart mate pairs can be to still be considered for bedtools merging downstream. 
""" ## pipe stats output to grep cmd1 = [ ip.bins.samtools, "stats", os.path.join( data.dirs.refmapping, "{}-mapped-sorted.bam".format(sample.name)), ] cmd2 = ["grep", "SN"] proc1 = sps.Popen(cmd1, stderr=sps.STDOUT, stdout=sps.PIPE) proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc1.stdout) res = proc2.communicate()[0].decode() if proc2.returncode: raise IPyradWarningExit("error in %s: %s", cmd2, res) ## starting vals avg_insert = 0 stdv_insert = 0 avg_len = 0 ## iterate over results for line in res.split("\n"): if "insert size average" in line: avg_insert = float(line.split(":")[-1].strip()) elif "insert size standard deviation" in line: ## hack to fix sim data when stdv is 0.0. Shouldn't ## impact real data bcz stdv gets rounded up below stdv_insert = float(line.split(":")[-1].strip()) + 0.1 elif "average length" in line: avg_len = float(line.split(":")[-1].strip()) LOGGER.debug("avg {} stdv {} avg_len {}" .format(avg_insert, stdv_insert, avg_len)) ## If all values return successfully set the max inner mate distance. ## This is tricky. avg_insert is the average length of R1+R2+inner mate ## distance. avg_len is the average length of a read. If there are lots ## of reads that overlap then avg_insert will be close to but bigger than ## avg_len. We are looking for the right value for `bedtools merge -d` ## which wants to know the max distance between reads. if all([avg_insert, stdv_insert, avg_len]): ## If 2 * the average length of a read is less than the average ## insert size then most reads DO NOT overlap if stdv_insert < 5: stdv_insert = 5. if (2 * avg_len) < avg_insert: hack = avg_insert + (3 * np.math.ceil(stdv_insert)) - (2 * avg_len) ## If it is > than the average insert size then most reads DO ## overlap, so we have to calculate inner mate distance a little ## differently. 
else: hack = (avg_insert - avg_len) + (3 * np.math.ceil(stdv_insert)) ## set the hackerdict value LOGGER.info("stdv: hacked insert size is %s", hack) data._hackersonly["max_inner_mate_distance"] = int(np.math.ceil(hack)) else: ## If something fsck then set a relatively conservative distance data._hackersonly["max_inner_mate_distance"] = 300 LOGGER.debug("inner mate distance for {} - {}".format(sample.name,\ data._hackersonly["max_inner_mate_distance"])) # In[33]: #from ipyrad.assemble.refmap import * data = s3.data samples = list(data.samples.values()) sample = samples[0] regions = bedtools_merge(data, sample).strip().split("\n") # In[34]: nregions = len(regions) chunksize = (nregions / 10) + (nregions % 10) # In[43]: from pysam import AlignmentFile sample.files.mapped_reads = os.path.join( data.dirs.refmapping, sample.name + "-mapped-sorted.bam") samfile = AlignmentFile(sample.files.mapped_reads, 'rb') #"./tortas_refmapping/PZ70-mapped-sorted.bam", "rb") # In[97]: regions[1] # In[170]: samfile = pysam.AlignmentFile("ex1.sam", "r") # In[262]: a = pysam.AlignmentFile("./3-refsetest_refmapping/1A_0.sam") # In[287]: nthreads = 2 cmd1 = [ ip.bins.bwa, "mem", "-t", str(max(1, nthreads)), "-M", data.paramsdict['reference_sequence'], sample.concatfiles[0][0], sample.concatfiles[0][1] or "", ] cmd1 = [i for i in cmd1 if i] # Insert optional flags for bwa bwa_args = data._hackersonly["bwa_args"].split() bwa_args.reverse() for arg in bwa_args: cmd1.insert(2, arg) with open(sample.files.sam, 'wb') as outfile: proc1 = sps.Popen(cmd1, stderr=None, stdout=outfile) error1 = proc1.communicate()[0] if proc1.returncode: raise IPyradError(error1) # In[288]: def bwa_map(data, sample, nthreads, force): """ Map reads to reference sequence. This reads in the fasta files (samples.files.edits), and maps each read to the reference. Unmapped reads are dropped right back in the de novo pipeline. Mapped reads end up in a sam file. """ sample.files.sam = os.path.join( data.dirs.refmapping, "{}.sam".format(sample.name)) sample.files.mapped_reads = os.path.join( data.dirs.refmapping, "{}-mapped-sorted.bam".format(sample.name)) sample.files.unmapped_bam = os.path.join( data.dirs.refmapping, "{}-unmapped.bam".format(sample.name)) sample.files.unmapped_reads = os.path.join( data.dirs.refmapping, "{}-unmapped.fastq".format(sample.name)) # (cmd1) bwa mem [OPTIONS] [] # -t # : Number of threads # -M : Mark split alignments as secondary. # (cmd2) samtools view [options] || [region ...] # -b = write to .bam # -q = Only keep reads with mapq score >= 30 (seems to be pretty standard) # -F = Select all reads that DON'T have these flags. # 0x4 (segment unmapped) # 0x100 (Secondary alignment) # 0x800 (supplementary alignment) # -U = Write out all reads that don't pass the -F filter # (all unmapped reads go to this file). # (cmd3) samtools sort [options...] 
[in.bam] # -T = Temporary file name, this is required by samtools, ignore it # Here we hack it to be samhandle.tmp cuz samtools cleans it up # -O = Output file format, in this case bam # -o = Output file name # (cmd5) samtools bam2fq -v 45 [in.bam] # -v45 set the default qscore arbirtrarily high # cmd1 = [ ip.bins.bwa, "mem", "-t", str(max(1, nthreads)), "-M", data.paramsdict['reference_sequence'], sample.concatfiles[0][0], sample.concatfiles[0][1] or "", ] cmd1 = [i for i in cmd1 if i] # Insert optional flags for bwa bwa_args = data._hackersonly["bwa_args"].split() bwa_args.reverse() for arg in bwa_args: cmd1.insert(2, arg) with open(sample.files.sam, 'wb') as outfile: proc1 = sps.Popen(cmd1, stderr=None, stdout=outfile) error1 = proc1.communicate()[0] if proc1.returncode: raise IPyradError(error1) # sends unmapped reads to a files and will PIPE mapped reads to cmd3 cmd2 = [ ip.bins.samtools, "view", "-b", "-F", "0x904", "-U", sample.files.unmapped_bam, sample.files.sam, ] # this is gonna catch mapped bam output from cmd2 and write to file cmd3 = [ ip.bins.samtools, "sort", "-T", os.path.join(data.dirs.refmapping, sample.name + ".sam.tmp"), "-O", "bam", "-o", sample.files.mapped_reads] # Later we're gonna use samtools to grab out regions using 'view' cmd4 = [ipyrad.bins.samtools, "index", sample.files.mapped_reads] # convert unmapped reads to fastq cmd5 = [ ip.bins.samtools, "bam2fq", "-v 45", sample.files.unmapped_bam, ] # Insert additional arguments for paired data to the commands. # We assume Illumina paired end reads for the orientation # of mate pairs (orientation: ---> <----). if 'pair' in data.paramsdict["datatype"]: # add samtools filter for only keep if both pairs hit # 0x1 - Read is paired # 0x2 - Each read properly aligned cmd2.insert(2, "0x3") cmd2.insert(2, "-f") # tell bam2fq that there are output files for each read pair cmd5.insert(2, os.path.join( data.dirs.edits, sample.name + "-tmp-umap1.fastq")) cmd5.insert(2, "-1") cmd5.insert(2, os.path.join( data.dirs.edits, sample.name + "-tmp-umap2.fastq")) cmd5.insert(2, "-2") else: cmd5.insert(2, sample.files.unmapped_reads) cmd5.insert(2, "-0") # cmd2 writes to sname.unmapped.bam and fills pipe with mapped BAM data LOGGER.debug(" ".join(cmd2)) proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE) # cmd3 pulls mapped BAM from pipe and writes to sname.mapped-sorted.bam LOGGER.debug(" ".join(cmd3)) proc3 = sps.Popen(cmd3, stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc2.stdout) error3 = proc3.communicate()[0] if proc3.returncode: raise IPyradWarningExit(error3) proc2.stdout.close() # cmd4 indexes the bam file LOGGER.debug(" ".join(cmd4)) proc4 = sps.Popen(cmd4, stderr=sps.STDOUT, stdout=sps.PIPE) error4 = proc4.communicate()[0] if proc4.returncode: raise IPyradWarningExit(error4) # Running cmd5 writes to either edits/sname-refmap_derep.fastq for SE # or it makes edits/sname-tmp-umap{12}.fastq for paired data, which # will then need to be merged. 
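# The cmd2 -> cmd3 chain above follows the standard subprocess piping
# pattern: the producer writes to a PIPE, the consumer reads that pipe as
# stdin, only the consumer is communicate()d with, and then the producer's
# stdout is closed. A minimal stand-in using common POSIX tools instead of
# samtools (assumes `printf` and `sort` are on PATH).
import subprocess as sps

producer = sps.Popen(["printf", "b\\na\\nc\\n"], stdout=sps.PIPE)
consumer = sps.Popen(["sort"], stdin=producer.stdout, stdout=sps.PIPE)
out = consumer.communicate()[0]
producer.stdout.close()
print(out.decode())        # -> a, b, c each on its own line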
LOGGER.debug(" ".join(cmd5)) proc5 = sps.Popen(cmd5, stderr=sps.STDOUT, stdout=sps.PIPE) error5 = proc5.communicate()[0] if proc5.returncode: raise IPyradWarningExit(error5) # In[289]: bwa_map(data, sample, 2, 1) # In[ ]: # In[205]: cmd1 = [ ip.bins.bwa, "mem", "-t", str(max(1, 2)), "-M", data.paramsdict['reference_sequence'], sample.concatfiles[0][0], sample.concatfiles[0][1] or "", ] proc1 = sps.Popen(cmd1)#, stderr=cmd1_stderr, stdout=cmd1_stdout) proc1.communicate() # In[ ]: pysam.sort("-m", "1000000", "-o", "output.bam", "ex1.bam") # In[182]: iterreg = samfile.fetch("MT", 5009, 5100) iterreg = samfile.fetch("MT", 10109, 10200) iterreg = samfile.fetch("MT", 285510, 285600) # In[181]: for read in iterreg: print(read.seq) print(read.compare(read.get_reference_sequence())) # In[104]: it = samfile.pileup("MT", 5009, 5100) pysam.Pileup. # In[102]: for read in iterreg: print(read.aligned_pairs) # In[87]: rdict = {} for read in iterreg: if read.qname not in rdict: rdict[read.qname] = read # In[80]: read.seq # In[84]: read.cigar # In[77]: sorted( rdict.keys(), key=lambda x: int(x.split("")) # In[65]: it = samfile.pileup('MT', 5009, 5100) # In[71]: clusts = [] nclusts = 0 for region in regions: chrom, pos1, pos2 = region.split() clust = fetch_cluster_se(data, samfile, chrom, int(pos1), int(pos2)) # In[ ]: def fetch_cluster_se(data, samfile, chrom, rstart, rend): """ Builds a single end cluster from the refmapped data. """ ## If SE then we enforce the minimum overlap distance to avoid the ## staircase syndrome of multiple reads overlapping just a little. overlap_buffer = data._hackersonly["min_SE_refmap_overlap"] ## the *_buff variables here are because we have to play patty ## cake here with the rstart/rend vals because we want pysam to ## enforce the buffer for SE, but we want the reference sequence ## start and end positions to print correctly for downstream. rstart_buff = rstart + overlap_buffer rend_buff = rend - overlap_buffer ## Reads that map to only very short segements of the reference ## sequence will return buffer end values that are before the ## start values causing pysam to complain. Very short mappings. if rstart_buff > rend_buff: tmp = rstart_buff rstart_buff = rend_buff rend_buff = tmp ## Buffering can't make start and end equal or pysam returns nothing. if rstart_buff == rend_buff: rend_buff += 1 ## store pairs rdict = {} clust = [] iterreg = [] iterreg = samfile.fetch(chrom, rstart_buff, rend_buff) ## use dict to match up read pairs for read in iterreg: if read.qname not in rdict: rdict[read.qname] = read ## sort dict keys so highest derep is first ('seed') sfunc = lambda x: int(x.split(";size=")[1].split(";")[0]) rkeys = sorted(rdict.keys(), key=sfunc, reverse=True) ## get blocks from the seed for filtering, bail out if seed is not paired try: read1 = rdict[rkeys[0]] except ValueError: LOGGER.error("Found bad cluster, skipping - key:{} rdict:{}".format(rkeys[0], rdict)) return "" ## the starting blocks for the seed poss = read1.get_reference_positions(full_length=True) seed_r1start = min(poss) seed_r1end = max(poss) ## store the seed ------------------------------------------- if read1.is_reverse: seq = revcomp(read1.seq) else: seq = read1.seq ## store, could write orient but just + for now. size = sfunc(rkeys[0]) clust.append(">{}:{}:{};size={};*\n{}"\ .format(chrom, seed_r1start, seed_r1end, size, seq)) ## If there's only one hit in this region then rkeys will only have ## one element and the call to `rkeys[1:]` will raise. Test for this. 
if len(rkeys) > 1: ## store the hits to the seed ------------------------------- for key in rkeys[1:]: skip = False try: read1 = rdict[key] except ValueError: ## enter values that will make this read get skipped read1 = rdict[key][0] skip = True ## orient reads only if not skipping if not skip: poss = read1.get_reference_positions(full_length=True) minpos = min(poss) maxpos = max(poss) ## store the seq if read1.is_reverse: seq = revcomp(read1.seq) else: seq = read1.seq ## store, could write orient but just + for now. size = sfunc(key) clust.append(">{}:{}:{};size={};+\n{}"\ .format(chrom, minpos, maxpos, size, seq)) else: ## seq is excluded, though, we could save it and return ## it as a separate cluster that will be aligned separately. pass return clust # In[62]: cmd1 = [ ip.bins.bwa, "mem", "-t", str(max(1, nthreads)), "-M", data.paramsdict['reference_sequence'], ] cmd1 += [i for i in sample.concatfiles[0] if i] # Insert optional flags for bwa bwa_args = data._hackersonly["bwa_args"].split() bwa_args.reverse() for arg in bwa_args: cmd1.insert(2, arg) cmd1_stdout_handle = os.path.join( data.dirs.refmapping, sample.name + ".sam") cmd1_stdout = open(cmd1_stdout_handle, 'w') cmd1_stderr = None proc1 = sps.Popen(cmd1, stderr=cmd1_stderr, stdout=cmd1_stdout) # In[63]: error1 = proc1.communicate()[0] # In[65]: cmd2 = [ ip.bins.samtools, "view", "-b", #"-q", "30", "-F", "0x904", "-U", os.path.join( data.dirs.refmapping, sample.name + "-unmapped.bam"), os.path.join(data.dirs.refmapping, sample.name + ".sam")] # In[69]: proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE) #proc2.communicate() # In[74]: proc3 = sps.Popen(cmd3, stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc2.stdout) proc3.communicate() # In[77]: proc4 = sps.Popen(cmd4, stderr=sps.STDOUT, stdout=sps.PIPE) proc4.communicate() # In[81]: sample.files.unmapped_reads = os.path.join( data.dirs.refmapping, "{}-unmapped.fastq".format(sample.name)) cmd4 = [ip.bins.samtools, "index", sample.files.mapped_reads] # this is gonna read in the unmapped files, args are added below, # and it will output fastq formatted unmapped reads for merging. # -v 45 sets the default qscore arbitrarily high cmd5 = [ ip.bins.samtools, "bam2fq", "-v 45", os.path.join(data.dirs.refmapping, sample.name + "-unmapped.bam")] # Insert additional arguments for paired data to the commands. # We assume Illumina paired end reads for the orientation # of mate pairs (orientation: ---> <----). 
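# The samtools bitflags used around here are just OR-ed SAM flag bits.
# A tiny sanity check of the two composite values: -F 0x904 drops unmapped,
# secondary, and supplementary alignments, and -f 0x3 keeps reads that are
# paired AND properly aligned (bit values are from the SAM spec).
UNMAPPED, SECONDARY, SUPPLEMENTARY = 0x4, 0x100, 0x800
PAIRED, PROPER_PAIR = 0x1, 0x2
assert UNMAPPED | SECONDARY | SUPPLEMENTARY == 0x904
assert PAIRED | PROPER_PAIR == 0x3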
if 'pair' in data.paramsdict["datatype"]: # add samtools filter for only keep if both pairs hit # 0x1 - Read is paired # 0x2 - Each read properly aligned cmd2.insert(2, "0x3") cmd2.insert(2, "-f") # tell bam2fq that there are output files for each read pair cmd5.insert(2, os.path.join(data.dirs.edits, sample.name + "-tmp-umap1.fastq")) cmd5.insert(2, "-1") cmd5.insert(2, os.path.join(data.dirs.edits, sample.name + "-tmp-umap2.fastq")) cmd5.insert(2, "-2") else: cmd5.insert(2, sample.files.unmapped_reads) cmd5.insert(2, "-0") # In[82]: cmd5 # In[83]: proc5 = sps.Popen(cmd5, stderr=sps.STDOUT, stdout=sps.PIPE) error5 = proc5.communicate()[0] # In[73]: sample.files.mapped_reads = os.path.join( data.dirs.refmapping, "{}-mapped-sorted.bam".format(sample.name)) # this is gonna catch mapped bam output from cmd2 and write to file cmd3 = [ ip.bins.samtools, "sort", "-T", os.path.join(data.dirs.refmapping, sample.name + ".sam.tmp"), "-O", "bam", "-o", sample.files.mapped_reads] cmd3 # In[53]: data.dirs.refmapping # In[17]: nthreads = 2 cmd1 = [ ip.bins.bwa, "mem", "-t", str(max(1, nthreads)), "-M", data.paramsdict['reference_sequence'] ] cmd1 #cmd1 += sample.files.dereps # In[ ]: # In[4]: s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient) sample = s3.samples[0] s3.run() # In[7]: #derep_sort_map(s3.data, sample, s3.force, s3.nthreads) sample.concatfiles = concat_multiple_edits(s3.data, sample) # In[13]: sample.mergedfile = merge_pairs(s3.data, sample, 1, 1) sample.mergedfile # In[12]: new_derep_and_sort(s3.data, sample.mergedfile, os.path.join(s3.data.tmpdir, sample.name+ "_derep.fastq"), s3.nthreads) # In[ ]: strand = "plus" if s3.data.paramsdict["datatype"] is ('gbs' or '2brad'): strand = "both" cmd = [ ip.bins.vsearch, "--derep_fulllength", sample.mergedfile, "--strand", strand, "--output", outfile, "--threads", str(nthreads), "--fasta_width", str(0), "--fastq_qmax", "1000", "--sizeout", "--relabel_md5", ] # In[14]: s3.data.run("4") # In[14]: s3.remote_run_dereps() #s3.remote_run_cluster_map_build() # In[15]: for sample in s3.samples: cluster(s3.data, sample, s3.nthreads, s3.force) # In[8]: for sample in s3.samples: build_clusters(s3.data, sample, s3.maxindels) # In[26]: for sample in s3.samples: muscle_chunker(s3.data, sample) # In[33]: aasyncs = {} for sample in s3.samples: aasyncs[sample.name] = [] for idx in range(10): handle = os.path.join(s3.data.tmpdir, "{}_chunk_{}.ali".format(sample.name, idx)) align_and_parse(handle, s3.maxindels, s3.gbs) # In[32]: def _get_derep_num(read): "return the number of replicates in a derep read" return int(read.split("=")[-1].split("\n")[0][:-1]) def _aligned_indel_filter(clust, max_internal_indels): """ checks for too many internal indels in muscle aligned clusters """ ## make into list lclust = clust.split() ## paired or not try: seq1 = [i.split("nnnn")[0] for i in lclust[1::2]] seq2 = [i.split("nnnn")[1] for i in lclust[1::2]] intindels1 = [i.rstrip("-").lstrip("-").count("-") for i in seq1] intindels2 = [i.rstrip("-").lstrip("-").count("-") for i in seq2] intindels = intindels1 + intindels2 if max(intindels) > max_internal_indels: return 1 except IndexError: seq1 = lclust[1::2] intindels = [i.rstrip("-").lstrip("-").count("-") for i in seq1] if max(intindels) > max_internal_indels: return 1 return 0 # In[34]: for sample in s3.samples: reconcat(s3.data, sample) # In[27]: def align_and_parse(handle, max_internal_indels=5, is_gbs=False): """ much faster implementation for aligning chunks """ ## data are already chunked, read in the whole 
thing. bail if no data. try: with open(handle, 'rb') as infile: clusts = infile.read().decode().split("//\n//\n") # remove any empty spots clusts = [i for i in clusts if i] # Skip entirely empty chunks if not clusts: raise IPyradError("no clusters in file: {}".format(handle)) except (IOError, IPyradError): LOGGER.debug("skipping empty chunk - {}".format(handle)) return 0 ## count discarded clusters for printing to stats later highindels = 0 ## iterate over clusters sending each to muscle, splits and aligns pairs aligned = _persistent_popen_align3(clusts, 200, is_gbs) ## store good alignments to be written to file refined = [] ## filter and trim alignments for clust in aligned: # check for too many internal indels if not _aligned_indel_filter(clust, max_internal_indels): refined.append(clust) else: highindels += 1 ## write to file after if refined: outhandle = handle.rsplit(".", 1)[0] + ".aligned" with open(outhandle, 'wb') as outfile: outfile.write(str.encode("\n//\n//\n".join(refined) + "\n")) ## remove the old tmp file if not LOGGER.getEffectiveLevel() == 10: os.remove(handle) return highindels # In[28]: def _persistent_popen_align3(clusts, maxseqs=200, is_gbs=False): """ keeps a persistent bash shell open and feeds it muscle alignments """ ## create a separate shell for running muscle in, this is much faster ## than spawning a separate subprocess for each muscle call proc = sps.Popen( ["bash"], stdin=sps.PIPE, stdout=sps.PIPE, bufsize=0, ) ## iterate over clusters in this file until finished aligned = [] for clust in clusts: ## new alignment string for read1s and read2s align1 = [] align2 = [] ## don't bother aligning if only one seq if clust.count(">") == 1: aligned.append(clust.replace(">", "").strip()) else: # do we need to split the alignment? (is there a PE insert?) try: # make into list (only read maxseqs lines, 2X cuz names) lclust = clust.split()[:maxseqs * 2] # try to split cluster list at nnnn separator for each read lclust1 = list(chain(*zip( lclust[::2], [i.split("nnnn")[0] for i in lclust[1::2]]))) lclust2 = list(chain(*zip( lclust[::2], [i.split("nnnn")[1] for i in lclust[1::2]]))) # put back into strings clust1 = "\n".join(lclust1) clust2 = "\n".join(lclust2) # Align the first reads. # The muscle command with alignment as stdin and // as split cmd1 = ("echo -e '{}' | {} -quiet -in - ; echo {}" .format(clust1, ip.bins.muscle, "//\n")) # send cmd1 to the bash shell proc.stdin.write(cmd1.encode()) # read the stdout by line until splitter is reached # meaning that the alignment is finished. for line in iter(proc.stdout.readline, b'//\n'): align1.append(line.decode()) # Align the second reads. # The muscle command with alignment as stdin and // as split cmd2 = ("echo -e '{}' | {} -quiet -in - ; echo {}" .format(clust2, ip.bins.muscle, "//\n")) # send cmd2 to the bash shell proc.stdin.write(cmd2.encode()) # read the stdout by line until splitter is reached # meaning that the alignment is finished. 
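# The read loop just below uses the two-argument iter(callable, sentinel)
# form: readline() is called repeatedly until it returns exactly b'//\n',
# the terminator echoed after each muscle run. A self-contained demo of the
# same idiom on an in-memory stream:
import io

stream = io.BytesIO(b">seq1\nTGCAG--AA\n>seq2\nTGCAGTTAA\n//\n>ignored\n")
lines = [line.decode() for line in iter(stream.readline, b'//\n')]
print("".join(lines))    # everything before the '//' sentinel line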
# In[28]:

def _persistent_popen_align3(clusts, maxseqs=200, is_gbs=False):
    """ keeps a persistent bash shell open and feeds it muscle alignments """

    ## create a separate shell for running muscle in, this is much faster
    ## than spawning a separate subprocess for each muscle call
    proc = sps.Popen(
        ["bash"],
        stdin=sps.PIPE,
        stdout=sps.PIPE,
        bufsize=0,
    )

    ## iterate over clusters in this file until finished
    aligned = []
    for clust in clusts:

        ## new alignment string for read1s and read2s
        align1 = []
        align2 = []

        ## don't bother aligning if only one seq
        if clust.count(">") == 1:
            aligned.append(clust.replace(">", "").strip())
        else:

            # do we need to split the alignment? (is there a PE insert?)
            try:
                # make into list (only read maxseqs lines, 2X cuz names)
                lclust = clust.split()[:maxseqs * 2]

                # try to split cluster list at nnnn separator for each read
                lclust1 = list(chain(*zip(
                    lclust[::2],
                    [i.split("nnnn")[0] for i in lclust[1::2]])))
                lclust2 = list(chain(*zip(
                    lclust[::2],
                    [i.split("nnnn")[1] for i in lclust[1::2]])))

                # put back into strings
                clust1 = "\n".join(lclust1)
                clust2 = "\n".join(lclust2)

                # Align the first reads.
                # The muscle command with alignment as stdin and // as split
                cmd1 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                        .format(clust1, ip.bins.muscle, "//\n"))

                # send cmd1 to the bash shell
                proc.stdin.write(cmd1.encode())

                # read the stdout by line until splitter is reached
                # meaning that the alignment is finished.
                for line in iter(proc.stdout.readline, b'//\n'):
                    align1.append(line.decode())

                # Align the second reads.
                # The muscle command with alignment as stdin and // as split
                cmd2 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                        .format(clust2, ip.bins.muscle, "//\n"))

                # send cmd2 to the bash shell
                proc.stdin.write(cmd2.encode())

                # read the stdout by line until splitter is reached
                # meaning that the alignment is finished.
                for line in iter(proc.stdout.readline, b'//\n'):
                    align2.append(line.decode())

                # join up aligned read1 and read2 and ensure names order match
                lines1 = "".join(align1)[1:].split("\n>")
                lines2 = "".join(align2)[1:].split("\n>")
                dalign1 = dict([i.split("\n", 1) for i in lines1])
                dalign2 = dict([i.split("\n", 1) for i in lines2])

                # sort the first reads
                keys = list(dalign1.keys())
                seed = [i for i in keys if i[-1] == "*"][0]
                keys.pop(keys.index(seed))
                order = [seed] + sorted(
                    keys, key=_get_derep_num, reverse=True)

                # combine in order
                alignpe = []
                for key in order:
                    alignpe.append("\n".join([
                        key,
                        dalign1[key].replace("\n", "") + "nnnn" + \
                        dalign2[key].replace("\n", "")]))

                ## append aligned cluster string
                aligned.append("\n".join(alignpe).strip())

            # Malformed clust. Dictionary creation with only 1 element
            except ValueError as inst:
                ip.logger.debug(
                    "Bad PE cluster - {}\nla1 - {}\nla2 - {}"
                    .format(clust, lines1, lines2))

            ## Either reads are SE, or at least some pairs are merged.
            except IndexError:

                # limit the number of input seqs
                # use lclust already built before checking pairs
                lclust = "\n".join(clust.split()[:maxseqs * 2])

                # the muscle command with alignment as stdin and // as splitter
                cmd = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                       .format(lclust, ip.bins.muscle, "//\n"))

                ## send cmd to the bash shell (TODO: PIPE could overflow here!)
                proc.stdin.write(cmd.encode())

                ## read the stdout by line until // is reached. This BLOCKS.
                for line in iter(proc.stdout.readline, b'//\n'):
                    align1.append(line.decode())

                ## remove '>' from names, and '\n' from inside long seqs
                lines = "".join(align1)[1:].split("\n>")

                ## find seed of the cluster and put it on top.
                seed = [i for i in lines if i.split(";")[-1][0] == "*"][0]
                lines.pop(lines.index(seed))
                lines = [seed] + sorted(
                    lines, key=_get_derep_num, reverse=True)

                ## format remove extra newlines from muscle
                aa = [i.split("\n", 1) for i in lines]
                align1 = [i[0] + '\n' + "".join([j.replace("\n", "")
                          for j in i[1:]]) for i in aa]

                # trim edges in sloppy gbs/ezrad data.
                # Maybe relevant to other types too...
                if is_gbs:
                    align1 = _gbs_trim(align1)

                ## append to aligned
                aligned.append("\n".join(align1))

    # cleanup
    proc.stdout.close()
    if proc.stderr:
        proc.stderr.close()
    proc.stdin.close()
    proc.wait()

    ## return the aligned clusters
    return aligned
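

# In[ ]:

# A minimal sketch (not part of the original notebook) of the persistent-shell
# pattern used by _persistent_popen_align3(): a single bash process stays open,
# commands are written to its stdin, and stdout is read line by line until a
# "//" sentinel marks the end of one command's output. Plain echo stands in
# for the muscle calls here.
demo_proc = sps.Popen(["bash"], stdin=sps.PIPE, stdout=sps.PIPE, bufsize=0)
demo_proc.stdin.write(b"echo first; echo second; echo '//'\n")
print([line.decode().strip() for line in iter(demo_proc.stdout.readline, b"//\n")])
# ['first', 'second']
demo_proc.stdin.close()
demo_proc.wait()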
# In[35]:

for sample in s3.samples:
    ip.assemble.clustmap._sample_cleanup(s3.data, sample)


# In[36]:

ip.assemble.clustmap._get_quick_depths(s3.data, sample)


# In[37]:

if sample.files.get('clusters'):
    pass


# In[43]:

fclust = data.samples[sample.name].files.clusters
clusters = gzip.open(fclust, 'rt')
pairdealer = izip(*[iter(clusters)] * 2)

## storage
depths = []
maxlen = []

## start with cluster 0
tdepth = 0
tlen = 0

## iterate until empty
while 1:
    ## grab next
    try:
        name, seq = next(pairdealer)
    except StopIteration:
        break

    ## if not the end of a cluster
    #print name.strip(), seq.strip()
    #print(name)
    if name.strip() == seq.strip():
        depths.append(tdepth)
        maxlen.append(tlen)
        tlen = 0
        tdepth = 0
    else:
        tdepth += int(name.strip().split("=")[-1][:-1])
        tlen = len(seq)


# In[12]:

sample.name


# In[ ]:


# In[6]:

s3.remote_run_dereps()


# In[7]:

s3.remote_run_cluster_map_build()


# In[8]:

s3.remote_run_align_cleanup()


# In[90]:

handle = "pairtest-tmpalign/1A_0_chunk_0.ali"
with open(handle, 'rb') as infile:
    clusts = infile.read().decode().split("//\n//\n")
    # remove any empty spots
    clusts = [i for i in clusts if i]
    # Skip entirely empty chunks
    if not clusts:
        raise IPyradError("no clusters in file: {}".format(handle))


# In[91]:

maxseqs = 200
is_gbs = False
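

# In[ ]:

# A toy, in-memory sketch (not from the original notebook) of the depth-counting
# loop a few cells above: clusters in the .clust.gz format are name/sequence
# line pairs separated by a "//"/"//" pair, and depth is summed from the
# ";size=N" labels. The names are made up and follow the "size=N" + "*"/"+"
# suffix style that the int(...[:-1]) parse expects; zip stands in for izip.
toy_clust = (
    ">aaa;size=5*\nTGCAGAAAA\n"
    ">bbb;size=2+\nTGCAGAAAT\n"
    "//\n//\n"
    ">ccc;size=3*\nTGCAGTTTT\n"
    "//\n//\n")
toy_dealer = zip(*[iter(toy_clust.splitlines(True))] * 2)
toy_depths, toy_depth = [], 0
for name, seq in toy_dealer:
    if name.strip() == seq.strip():
        toy_depths.append(toy_depth)
        toy_depth = 0
    else:
        toy_depth += int(name.strip().split("=")[-1][:-1])
print(toy_depths)   # [7, 3]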
# In[ ]:

proc = sps.Popen(
    ["bash"],
    stdin=sps.PIPE,
    stdout=sps.PIPE,
    bufsize=0,
)

## iterate over clusters in this file until finished
aligned = []
for clust in clusts:

    ## new alignment string for read1s and read2s
    align1 = []
    align2 = []

    ## don't bother aligning if only one seq
    if clust.count(">") == 1:
        aligned.append(clust.replace(">", "").strip())
    else:

        # do we need to split the alignment? (is there a PE insert?)
        try:
            # make into list (only read maxseqs lines, 2X cuz names)
            lclust = clust.split()[:maxseqs * 2]

            # try to split cluster list at nnnn separator for each read
            lclust1 = list(chain(*zip(
                lclust[::2],
                [i.split("nnnn")[0] for i in lclust[1::2]])))
            lclust2 = list(chain(*zip(
                lclust[::2],
                [i.split("nnnn")[1] for i in lclust[1::2]])))

            # put back into strings
            clust1 = "\n".join(lclust1)
            clust2 = "\n".join(lclust2)

            # Align the first reads.
            # The muscle command with alignment as stdin and // as split
            cmd1 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                    .format(clust1, ip.bins.muscle, "//\n"))

            # send cmd1 to the bash shell
            proc.stdin.write(cmd1.encode())

            # read the stdout by line until splitter is reached
            # meaning that the alignment is finished.
            # (stdout is bytes, so match a bytes sentinel and decode each line)
            for line in iter(proc.stdout.readline, b'//\n'):
                align1.append(line.decode())

            # Align the second reads.
            # The muscle command with alignment as stdin and // as split
            cmd2 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                    .format(clust2, ip.bins.muscle, "//\n"))

            # send cmd2 to the bash shell
            proc.stdin.write(cmd2.encode())

            # read the stdout by line until splitter is reached
            # meaning that the alignment is finished.
            for line in iter(proc.stdout.readline, b'//\n'):
                align2.append(line.decode())

            # join up aligned read1 and read2 and ensure names order match
            lines1 = "".join(align1)[1:].split("\n>")
            lines2 = "".join(align2)[1:].split("\n>")
            dalign1 = dict([i.split("\n", 1) for i in lines1])
            dalign2 = dict([i.split("\n", 1) for i in lines2])

            # sort the first reads
            keys = list(dalign1.keys())
            seed = [i for i in keys if i[-1] == "*"][0]
            keys.pop(keys.index(seed))
            order = [seed] + sorted(
                keys, key=_get_derep_num, reverse=True)

            # combine in order
            for key in order:
                align1.append("\n".join([
                    key,
                    dalign1[key].replace("\n", "") + "nnnn" + \
                    dalign2[key].replace("\n", "")]))

            ## append aligned cluster string
            aligned.append("\n".join(align1).strip())

        # Malformed clust. Dictionary creation with only 1 element
        except ValueError as inst:
            ip.logger.debug(
                "Bad PE cluster - {}\nla1 - {}\nla2 - {}"
                .format(clust, lines1, lines2))

        ## Either reads are SE, or at least some pairs are merged.
        except IndexError:

            # limit the number of input seqs
            # use lclust already built before checking pairs
            lclust = "\n".join(clust.split()[:maxseqs * 2])

            # the muscle command with alignment as stdin and // as splitter
            cmd = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                   .format(lclust, ip.bins.muscle, "//\n"))

            ## send cmd to the bash shell (TODO: PIPE could overflow here!)
            proc.stdin.write(cmd.encode())

            ## read the stdout by line until // is reached. This BLOCKS.
            for line in iter(proc.stdout.readline, b'//\n'):
                align1.append(line.decode())

            ## remove '>' from names, and '\n' from inside long seqs
            lines = "".join(align1)[1:].split("\n>")

            ## find seed of the cluster and put it on top.
            seed = [i for i in lines if i.split(";")[-1][0] == "*"][0]
            lines.pop(lines.index(seed))
            lines = [seed] + sorted(
                lines, key=_get_derep_num, reverse=True)

            ## format remove extra newlines from muscle
            aa = [i.split("\n", 1) for i in lines]
            align1 = [i[0] + '\n' + "".join([j.replace("\n", "")
                      for j in i[1:]]) for i in aa]

            # trim edges in sloppy gbs/ezrad data.
            # Maybe relevant to other types too...
            if is_gbs:
                align1 = _gbs_trim(align1)

            ## append to aligned
            aligned.append("\n".join(align1))

# cleanup
proc.stdout.close()
if proc.stderr:
    proc.stderr.close()
proc.stdin.close()
proc.wait()

## return the aligned clusters
#return aligned
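

# In[ ]:

# A minimal sketch (not part of the original notebook) of the name->sequence
# parsing and seed-first ordering used in the block above. The fake muscle-style
# output is made up; names follow the ";size=N" + "*"/"+"/"-" convention that
# _get_derep_num() and the seed test expect.
demo_out = ">seed;size=9*\nTGCAG-AAAA\n>hit1;size=2+\nTGCAGAAAA-\n>hit2;size=5-\nTGCAG--AAA\n"
demo_lines = demo_out[1:].split("\n>")
demo_align = dict([i.split("\n", 1) for i in demo_lines])
demo_keys = list(demo_align.keys())
demo_seed = [i for i in demo_keys if i[-1] == "*"][0]
demo_keys.pop(demo_keys.index(demo_seed))
demo_order = [demo_seed] + sorted(demo_keys, key=_get_derep_num, reverse=True)
print(demo_order)   # ['seed;size=9*', 'hit2;size=5-', 'hit1;size=2+']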
# In[ ]:


# In[37]:

from ipyrad.assemble.clustmap import _get_derep_num


# In[80]:

# join up aligned read1 and read2 and ensure names order match
lines1 = "".join(align1)[1:].split("\n>")
lines2 = "".join(align2)[1:].split("\n>")
dalign1 = dict([i.split("\n", 1) for i in lines1])
dalign2 = dict([i.split("\n", 1) for i in lines2])

# sort the first reads
keys = list(dalign1.keys())
seed = [i for i in keys if i[-1] == "*"][0]
keys.pop(keys.index(seed))
order = [seed] + sorted(
    keys, key=_get_derep_num, reverse=True)


# In[82]:

# join up aligned read1 and read2 and ensure names order match
lines1 = "".join(align1)[1:].split("\n>")
lines2 = "".join(align2)[1:].split("\n>")
dalign1 = dict([i.split("\n", 1) for i in lines1])
dalign2 = dict([i.split("\n", 1) for i in lines2])

# sort the first reads
keys = list(dalign1.keys())
seed = [i for i in keys if i[-1] == "*"][0]
keys.pop(keys.index(seed))
order = [seed] + sorted(
    keys, key=_get_derep_num, reverse=True)

# combine in order
for key in order:
    align1.append("\n".join([
        key,
        dalign1[key].replace("\n", "") + "nnnn" + \
        dalign2[key].replace("\n", "")]))

## append aligned cluster string
aligned.append("\n".join(align1).strip())


# In[83]:

aligned


# In[57]:

## remove '>' from names, and '\n' from inside long seqs
lines1 = "".join(align1)[1:].split("\n>")
seed = [i for i in lines1 if i.split(";")[-1][0] == "*"][0]
lines1.pop(lines1.index(seed))
lines1 = [seed] + sorted(
    lines1, key=_get_derep_num, reverse=True)
dalign1 = dict([i.split("\n", 1) for i in lines1])

lines2 = "".join(align2)[1:].split("\n>")
dalign2 = dict([i.split("\n", 1) for i in lines2])
#seed = [i for i in lines2 if i.split(";")[-1][0] == "*"][0]
#lines2.pop(lines2.index(seed))


# In[24]:

## format remove extra newlines from muscle
aa = [i.split("\n", 1) for i in lines]
align1 = [i[0] + '\n' + "".join([j.replace("\n", "")
          for j in i[1:]]) for i in aa]


# In[25]:

align1


# In[16]:

derephandle = os.path.join(data.tmpdir, sample.name + "_derep.fastq")
uhandle = os.path.join(data.dirs.clusts, sample.name + ".utemp")
temphandle = os.path.join(data.dirs.clusts, sample.name + ".htemp")


# In[19]:

derepfile = os.path.join(data.tmpdir, sample.name + "_derep.fastq")
uhandle = os.path.join(data.dirs.clusts, sample.name + ".utemp")
usort = os.path.join(data.dirs.clusts, sample.name + ".utemp.sort")
hhandle = os.path.join(data.dirs.clusts, sample.name + ".htemp")
sample.files.clusters = os.path.join(
    data.dirs.clusts, sample.name + ".clust.gz")
clustsout = gzip.open(sample.files.clusters, 'wt')


# In[20]:

cmd = ["sort", "-k", "2", uhandle, "-o", usort]
proc = sps.Popen(cmd, close_fds=True)
proc.communicate()[0]


# In[21]:

alldereps = {}
with open(derepfile, 'rt') as ioderep:
    dereps = izip(*[iter(ioderep)] * 2)
    for namestr, seq in dereps:
        nnn, sss = [i.strip() for i in (namestr, seq)]
        alldereps[nnn[1:]] = sss
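

# In[ ]:

# A made-up example (not from the original notebook) of one line of the sorted
# .utemp file consumed by the clustering loop below. Assuming the vsearch
# userfields used by cluster() are query+target+id+gaps+qstrand+qcov, the six
# columns unpack as: hit name, seed name, percent identity, indel count,
# orientation, and an unused coverage field.
demo_line = "hit1;size=2\tseed9;size=9\t98.5\t1\t+\t100.0"
demo_hit, demo_seed, _, demo_ind, demo_ori, _ = demo_line.strip().split()
print(demo_hit, demo_seed, int(demo_ind), demo_ori)   # hit1;size=2 seed9;size=9 1 +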
# In[25]:

seedsseen = set()
maxindels = 8

## Iterate through the usort file grabbing matches to build clusters
with open(usort, 'rt') as insort:
    ## iterator, seed null, seqlist null
    isort = iter(insort)
    lastseed = 0
    fseqs = []
    seqlist = []
    seqsize = 0

    while 1:
        ## grab the next line
        try:
            hit, seed, _, ind, ori, _ = next(isort).strip().split()
        except StopIteration:
            break

        ## new seed: store the last cluster (fseq), count it, and clear fseq
        if seed != lastseed:
            seedsseen.add(seed)
            if fseqs:
                ## sort fseqs by derep after pulling out the seed
                fseqs = [fseqs[0]] + sorted(
                    fseqs[1:],
                    key=lambda x: int(x.split(";size=")[1].split(";")[0]),
                    reverse=True)
                seqlist.append("\n".join(fseqs))
                seqsize += 1
                fseqs = []

            # occasionally write/dump stored clusters to file and clear mem
            if not seqsize % 10000:
                if seqlist:
                    clustsout.write(
                        "\n//\n//\n".join(seqlist) + "\n//\n//\n")
                    ## reset list and counter
                    seqlist = []

            ## store the new seed on top of fseq list
            fseqs.append(">{}*\n{}".format(seed, alldereps[seed]))
            lastseed = seed

        ## add match to the seed
        ## revcomp if orientation is reversed (comp preserves nnnn)
        if ori == "-":
            seq = comp(alldereps[hit])[::-1]
        else:
            seq = alldereps[hit]

        ## only save if not too many indels
        if int(ind) <= maxindels:
            fseqs.append(">{}{}\n{}".format(hit, ori, seq))
        else:
            ip.logger.info("filtered by maxindels: %s %s", ind, seq)

## write whatever is left over to the clusts file
if fseqs:
    seqlist.append("\n".join(fseqs))
if seqlist:
    clustsout.write("\n//\n//\n".join(seqlist) + "\n//\n//\n")

## now write the seeds that had no hits. Make dict from htemp
with open(hhandle, 'rt') as iotemp:
    nohits = izip(*[iter(iotemp)] * 2)
    seqlist = []
    seqsize = 0

    while 1:
        try:
            nnn, _ = [i.strip() for i in next(nohits)]
        except StopIteration:
            break

        ## occasionally write to file
        if not seqsize % 10000:
            if seqlist:
                clustsout.write("\n//\n//\n".join(seqlist) + "\n//\n//\n")
                ## reset list and counter
                seqlist = []

        ## append to list if new seed
        if nnn[1:] not in seedsseen:
            seqlist.append("{}*\n{}".format(nnn, alldereps[nnn[1:]]))
            seqsize += 1

## write whatever is left over to the clusts file
if seqlist:
    clustsout.write("\n//\n//\n".join(seqlist))

## close the file handle
clustsout.close()
del alldereps


# In[28]:

seedsseen


# In[6]:

sample.concatfiles = concat_multiple_edits(data, sample)


# In[7]:

sample.mergedfile = merge_pairs(data, sample, 1, 1)


# In[10]:

new_derep_and_sort(
    data,
    sample.mergedfile,
    os.path.join(data.tmpdir, sample.name + "_derep.fastq"),
    2)


# In[6]:

data = data
sample = sample
revcomp = 1
vsearch_merge = 1
sample.concatfiles = concat_multiple_edits(data, sample)
sample.mergefile = os.path.join(data.tmpdir, sample.name + "_merged_.fastq")


# In[8]:

sample.mergefile = os.path.join(data.tmpdir, sample.name + "_merged_.fastq")
if 'pair' in data.paramsdict['datatype']:
    if "reference" not in data.paramsdict["assembly_method"]:
        nmerged = ip.assemble.clustmap._merge_pairs(data, sample, 1, 1)
    else:
        nmerged = 0  # _merge_pairs(data, sample, 0, 0)
    sample.stats.reads_merged = nmerged


# In[ ]:

# was `new_derep_and_sort(data, sampl)`; filled in to match the working call
# in the cell above and the signature of the function defined below
new_derep_and_sort(
    data,
    sample.mergedfile,
    os.path.join(data.tmpdir, sample.name + "_derep.fastq"),
    2)
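

# In[ ]:

# A toy illustration (not from the original notebook) of the size-sorting done
# inside the clustering loop above: after the seed, hits are ordered by their
# dereplication depth. The names are made up and use the ";size=N;" label form
# that the lambda's split(";size=")[1].split(";")[0] parse expects.
demo_fseqs = [
    ">seed;size=9;*\nTGCAGAAAA",
    ">h1;size=2;+\nTGCAGAAAT",
    ">h2;size=6;-\nTGCAGAAAC",
]
demo_sorted = [demo_fseqs[0]] + sorted(
    demo_fseqs[1:],
    key=lambda x: int(x.split(";size=")[1].split(";")[0]),
    reverse=True)
print([i.split("\n")[0] for i in demo_sorted])
# ['>seed;size=9;*', '>h2;size=6;-', '>h1;size=2;+']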
""" ## datatypes options strand = "plus" if data.paramsdict["datatype"] is ('gbs' or '2brad'): strand = "both" ## do dereplication with vsearch cmd = [ ip.bins.vsearch, "--derep_fulllength", infile, "--strand", strand, "--output", outfile, "--threads", str(nthreads), "--fasta_width", str(0), "--fastq_qmax", "1000", "--sizeout", "--relabel_md5", ] ip.logger.info("derep cmd %s", cmd) ## build PIPEd job proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE, close_fds=True) errmsg = proc.communicate()[0] if proc.returncode: ip.logger.error("error inside derep_and_sort %s", errmsg) raise IPyradWarningExit(errmsg) # In[ ]: new_derep_and_sort() # In[43]: ## CONCAT FILES FOR MERGED ASSEMBLIES mergefile = os.path.join(data.tmpdir, sample.name + "_merged_.fastq") sample.files.edits = concat_multiple_edits(data, sample) # In[41]: sample.files.edits = [(mergefile, )] nmerged = ip.assemble.clustmap._merge_pairs( data, sample.files.edits, mergefile, 1, 1) sample.stats.reads_merged = nmerged # In[33]: mergefile # In[44]: sample.files.edits # In[5]: s3.run() # In[5]: # muscle align returns values for bad alignments ip.assemble.cluster_within.sample_cleanup(data, samples[0]) # In[6]: data._build_stat("s3") # In[7]: raise IPyradError("hi") # In[4]: self = data derep_sort_map(s3.data, samples[0], s3.force) # In[11]: ra.exception() # In[ ]: data.get_params()