#!/usr/bin/env python
# coding: utf-8

# In[1]:

# stdlib and third-party modules used directly in the debug cells below
import glob
import gzip
import os
import subprocess as sps
import time

import h5py
import numpy as np
from pysam import AlignmentFile

import ipyrad as ip
import ipyparallel as ipp


# In[2]:

from ipyrad.assemble.clustmap import *
from ipyrad.assemble.consens_se import *


# In[3]:

ipyclient = ipp.Client()


# In[4]:

data = ip.Assembly("test")
data.set_params("project_dir", "hotfix")
data.set_params("raw_fastq_path", "ipsimdata/pairddrad_example_R*_.fastq.gz")
data.set_params("barcodes_path", "ipsimdata/pairddrad_example_barcodes.txt")
data.set_params("reference_sequence", "ipsimdata/pairddrad_example_genome.fa")
data.set_params("assembly_method", "reference")
data.set_params("datatype", "pairddrad")
data.set_params("restriction_overhang", ("TGCAG", "CCG"))
data.get_params()


# In[5]:

data.run("12", force=True)


# In[7]:

data.run("3")


# In[8]:

data.run("4")


# In[9]:

data.run("5")


# In[11]:

data.run("6")


# In[51]:

from ipyrad.assemble.write_outputs import *

self = Step7(data, True, ipyclient)
self.split_clusters()

jobs = glob.glob(os.path.join(self.data.tmpdir, "chunk-*"))
jobs = sorted(jobs, key=lambda x: int(x.rsplit("-")[-1]))
for jobfile in jobs:
    args = (self.data, self.chunksize, jobfile)


# In[119]:

def get_edges(self, seqs):
    """
    Trim terminal edges or mask internal edges based on three criteria and
    take the max for each edge.
    1. user entered hard trimming.
    2. removing cutsite overhangs.
    3. trimming singleton-like overhangs from seqs of diff lengths.
    """
    # record whether to filter this locus based on sample coverage
    bad = False

    # 1. hard trim edges
    trim1 = np.array(self.data.paramsdict["trim_loci"])

    # 2. fuzzy match for trimming restriction site where it's expected.
    trim2 = np.array([0, 0, 0, 0])
    overhangs = np.array([
        i.encode() for i in self.data.paramsdict["restriction_overhang"]
    ])
    for pos, overhang in enumerate(overhangs):
        if overhang:
            cutter = np.array(list(overhang))
            trim2[pos] = check_cutter(seqs, pos, cutter, 0.75)

    # 3. find where the edge is not indel marked (really unknown ("N"))
    trim3 = np.array([0, 0, 0, 0])
    try:
        minsamp = min(4, seqs.shape[0])
        # minsamp = max(minsamp, self.data.paramsdict["min_samples_locus"])
        mincovs = np.sum((seqs != 78) & (seqs != 45), axis=0)
        for pos in range(4):
            trim3[pos] = check_minsamp(seqs, pos, minsamp, mincovs)
    except ValueError:
        print('error')
        bad = True

    # get max edges
    print(trim1, trim2, trim3)
    trim = np.max([trim1, trim2, trim3], axis=0)

    # return edges as slice indices
    r1left = trim[0]

    # single-end simple:
    if "pair" not in self.data.paramsdict["datatype"]:
        r1right = seqs.shape[1] - trim[1]
        r2left = r2right = r1right
        edges = (r1left, r1right, r2left, r2right)
    else:
        # paired data: a sketch under the assumption that R1 and R2 are
        # joined by a lowercase "nnnn" spacer (110 as uint8) shared by every
        # row; trim R1-right back from the spacer and R2-left/right relative
        # to the spacer and locus end.
        spacer = np.where(np.all(seqs == 110, axis=0))[0]
        r1right = spacer.min() - trim[1]
        r2left = spacer.max() + 1 + trim[2]
        r2right = seqs.shape[1] - trim[3]
        edges = (r1left, r1right, r2left, r2right)

    # get filter
    if (r1right < r1left) or (r2left < r1right) or (r2right < r2left):
        bad = True

    # if loc length is too short then filter
    if (r2right - r1left) < self.data.paramsdict["filter_min_trim_len"]:
        bad = True

    return bad, edges


# In[120]:

def check_minsamp(seq, position, minsamp, mincovs):
    "used in Processor.get_edges() for trimming edges of - or N sites."
    if position == 0:
        # find leftmost edge with minsamp coverage
        leftmost = np.where(mincovs >= minsamp)[0]
        if leftmost.size:
            return leftmost.min()
        # no sites actually have minsamp coverage although there are minsamp
        # rows of data, this can happen when reads only partially overlap. Loc
        # should be excluded for minsamp filter.
        else:
            raise ValueError("no sites above minsamp coverage in edge trim")

    elif position == 1:
        maxhit = np.where(mincovs >= minsamp)[0].max()
        return seq.shape[1] - (maxhit + 1)

    ## TODO...
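    # positions 2 and 3 would apply to the R2 edges of paired loci; for now
    # they are left untrimmed (return 0), per the TODO above.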
    elif position == 2:
        return 0

    else:
        return 0


# In[121]:

# store list of edge trims for VCF building
edgelist = []

# todo: this could be an iterator...
with open(self.chunkfile, 'rb') as infile:
    loci = infile.read().split(b"//\n//\n")

    # iterate over loci
    for iloc, loc in enumerate(loci):

        # load names and seqs
        lines = loc.decode().strip().split("\n")
        names = []
        nidxs = []
        aseqs = []
        useqs = []
        for line in lines:
            if line[0] == ">":
                name, nidx = line[1:].rsplit("_", 1)
                names.append(name)
                nidxs.append(nidx)
            else:
                aseqs.append(list(line))
                useqs.append(list(line.upper()))

        # filter to include only samples in this assembly
        mask = [i in self.data.snames for i in names]
        names = np.array(names)[mask].tolist()
        nidxs = np.array(nidxs)[mask].tolist()
        useqs = np.array(useqs)[mask, :].astype(bytes).view(np.uint8)
        aseqs = np.array(aseqs)[mask, :].astype(bytes).view(np.uint8)

        # apply filters
        efilter, edges = get_edges(self, useqs)
        print(efilter, edges)


# In[54]:

data = self.data
chunksize = self.chunksize
chunkfile = jobs[0]
self = Processor(data, chunksize, chunkfile)


# In[23]:

self.remote_process_chunks()
self.collect_stats()
self.store_file_handles()


# In[24]:

start = time.time()
printstr = ("building arrays ", "s7")
rasyncs = {}

args0 = (self.data,)
args1 = (self.data, self.ntaxa, self.nbases, self.nloci)
args2 = (self.data, self.ntaxa, self.nsnps)

write_loci_and_alleles(*args0)


# In[25]:

fill_seq_array(*args1)


# In[49]:

data = self.data
ntaxa = self.ntaxa
nsnps = self.nsnps

# get faidict to convert chroms to ints
if data.isref:
    faidict = chroms2ints(data, True)

# open new database file handle
with h5py.File(data.snps_database, 'w') as io5:

    # Database files for storing arrays on disk.
    # Should optimize for slicing by rows if we run into slow writing, or if
    # it uses too much mem. For now let h5py do auto-chunking.
    io5.create_dataset(
        name="snps",
        shape=(ntaxa, nsnps),
        dtype=np.uint8,
    )
    # store snp locations:
    # (loc-counter, loc-snp-counter, loc-snp-pos, chrom, chrom-snp-pos)
    io5.create_dataset(
        name="snpsmap",
        shape=(nsnps, 5),
        dtype=np.uint32,
    )
    # store pseudo-reference alleles (up to 4 observed bases per SNP)
    io5.create_dataset(
        name="pseudoref",
        shape=(nsnps, 4),
        dtype=np.uint8,
    )
    # store genotype calls (0/0, 0/1, 0/2, etc.)
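    # genos holds two allele calls per sample per SNP (the trailing axis of
    # length 2), matching the 0/0, 0/1, ... notation above.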
    io5.create_dataset(
        name="genos",
        shape=(nsnps, ntaxa, 2),
        dtype=np.uint8,
    )

    # gather all loci bits
    locibits = glob.glob(os.path.join(data.tmpdir, "*.loci"))
    sortbits = sorted(
        locibits,
        key=lambda x: int(x.rsplit("-", 1)[-1][:-5]))

    # name order for entry in array
    sidxs = {sample: i for (i, sample) in enumerate(data.snames)}

    # iterate through file
    start = end = 0
    tmploc = {}
    locidx = 1
    snpidx = 1

    # array to store until writing
    tmparr = np.zeros((ntaxa, nsnps), dtype=np.uint8)
    tmpmap = np.zeros((nsnps, 5), dtype=np.uint32)

    # iterate over chunkfiles
    for bit in sortbits:
        # iterate lines of file until locus endings
        for line in iter(open(bit, 'r')):

            # still filling locus until |\n
            if "|\n" not in line:
                name, seq = line.split()
                tmploc[name] = seq

            # locus is full, dump it
            else:
                # convert seqs to an array
                loc = (
                    np.array([list(i) for i in tmploc.values()])
                    .astype(bytes).view(np.uint8)
                )
                snps, idxs, _ = line[len(data.snppad):].rsplit("|", 2)
                snpsmask = np.array(list(snps)) != " "
                snpsidx = np.where(snpsmask)[0]

                # select only the SNP sites
                snpsites = loc[:, snpsmask]

                # store end position of locus for map
                end = start + snpsites.shape[1]
                print(start, end)
                for idx, name in enumerate(tmploc):
                    tmparr[sidxs[name], start:end] = snpsites[idx, :]

                # store snpsmap data 1-indexed with chroms info
                if data.isref:
                    chrom, pos = idxs.split(",")[0].split(":")
                    start = int(pos.split("-")[0])
                    #chromidx = faidict[chrom]
                    chromidx = int(chrom)
                    for isnp in range(snpsites.shape[1]):
                        isnpx = snpsidx[isnp]
                        tmpmap[snpidx - 1] = (
                            locidx, isnp, isnpx, chromidx, isnpx + start,
                        )
                        snpidx += 1

                # store snpsmap data (snpidx is 1-indexed)
                else:
                    for isnp in range(snpsites.shape[1]):
                        tmpmap[snpidx - 1] = (
                            locidx, isnp, snpsidx[isnp], 0, snpidx,
                        )
                        snpidx += 1
                locidx += 1

                # reset locus
                start = end
                tmploc = {}

    # fill missing with 78 (N)
    tmparr[tmparr == 0] = 78


# In[48]:

print(tmparr.shape)
print(start, end)
tmparr[sidxs[name], start:end]


# In[21]:

fill_snp_array(*args2)


# In[12]:

data.run("7")


# In[ ]:


# In[ ]:


# In[5]:

data.run("3")


# In[6]:

data.run("456")


# In[11]:

data.run("1", force=True)


# In[9]:

data = ip.load_json("tortas/5-tortas.json")


# In[18]:

d2 = data.branch("d2", subsamples=["AGO09concat"])


# In[19]:

d2.run("3", force=True)


# In[20]:

d2.run("45", force=True)


# In[7]:

d2 = ip.load_json("./tortas/d2.json")


# In[9]:

d2.stats


# In[11]:

self = Step5(data, True, ipyclient)


# In[12]:

self.remote_calculate_depths()


# In[13]:

self.remote_make_chunks()


# In[14]:

statsdicts = self.remote_process_chunks()


# In[15]:

statsdicts


# In[ ]:

self.remote_concatenate_chunks()


# In[ ]:

self.data_store(statsdicts)


# In[10]:

data.stats


# In[5]:

dd = data.branch("dd")
dd.run("5", force=True)


# In[6]:

dd.run("3", force=True)


# In[4]:

data.run("5", force=True)


# In[5]:

step = Step6(data, True, ipyclient)


# In[6]:

step.remote_concat_bams()


# In[7]:

step.remote_build_ref_regions()


# In[8]:

self = step


# In[9]:

regs = self.regions[:20]
regs


# In[11]:

# access reads from bam file using pysam
bamfile = AlignmentFile(
    os.path.join(
        self.data.dirs.across,
        "cat.sorted.bam"),
    'rb')

# catcons output file for raw clusters and db samples
outfile = gzip.open(
    os.path.join(
        self.data.dirs.across,
        "{}.catcons.gz".format(self.data.name)),
    'wb')

# write header line to catcons with sample names
snames = sorted([i.name for i in self.samples])
nsamples = len(snames)
outfile.write(
    b" ".join([i.encode() for i in snames]) + b"\n")

# get clusters
lidx = 0
clusts = []
# while 1:
#     try:
#         region = next(self.regions)
#         reads = bamfile.fetch(*region)
#     except StopIteration:
#         break
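# NOTE: the commented-out while-loop above would consume every region from
# the generator; the for-loop below runs the same steps on only the
# 20-region debug subset stored in regs.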
for region in regs:
    reads = bamfile.fetch(*region)

    # get reference
    print(region)
    refn, refs = get_ref_region(
        data.paramsdict["reference_sequence"],
        region[0], region[1] + 1, region[2] + 1)

    # build cluster dict for sorting
    rdict = {}
    for read in reads:
        rdict[read.qname] = read.seq
    keys = sorted(rdict.keys(), key=lambda x: x.rsplit(":", 2)[0])

    # build cluster based on map positions (reads are already oriented)
    arr = np.zeros((nsamples + 1, len(refs)), dtype=bytes)

    # fill it
    arr[0] = list(refs)
    for idx, key in enumerate(keys):

        # get start and stop from this seq
        sname = key.rsplit("_", 1)[0]
        rstart, rstop = key.rsplit(":", 2)[-1].split("-")
        sidx = snames.index(sname)

        # get start relative to ref
        start = int(rstart) - int(region[1]) - 1
        stop = start + int(rstop) - int(rstart)
        print(sidx + 1, start, stop, arr.shape[1])
        arr[sidx + 1, int(start): int(stop)] = list(rdict[key])
    print("")

    # write this cluster to the catcons file, N-filling empty cells
    arr[arr == b""] = b"N"
    for line in arr:
        outfile.write(line.tostring() + b"\n")
    outfile.write(b"\n")

outfile.close()


# In[50]:

print(start, stop, stop - start, len(rdict[key]),
      rstart, rstop, int(rstop) - int(rstart))
print(region, region[2] - region[1], len(refs))


# In[61]:

# inspect the fields of the read name (sample name, chrom:start-end suffix)
key.rsplit(":", 2)


# In[62]:

snames


# In[141]:

revcomp("AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCKCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN")


# In[37]:

# get consens seq and variant site index
clust = []
avars = refvars(arr.view(np.uint8), PSEUDO_REF)
dat = b"".join(avars.view("S1")[:, 0]).decode()
snpstring = "".join(["*" if i else " " for i in avars[:, 1]])
clust.append("ref_{}:{}-{}\n{}".format(*region, dat))

# or, just build variant string (or don't...)
# write all loci with [locids, nsnps, npis, nindels, ?]
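# each cluster built below is appended to clusts and written out in blocks
# delimited by "//\n//\n" pairs, the same locus separator that the Step7
# chunkfile reader above splits on.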
for key in keys:
    clust.append("{}\n{}".format(key, rdict[key]))
clust.append("SNPs\n" + snpstring)
clusts.append("\n".join(clust))

# advance locus counter
lidx += 1

# write chunks to file
if not lidx % 1000:
    outfile.write(
        str.encode("\n//\n//\n".join(clusts) + "\n//\n//\n"))
    clusts = []

# write remaining
if clusts:
    outfile.write(
        str.encode("\n//\n//\n".join(clusts) + "\n//\n//\n"))
outfile.close()


# In[16]:

step.remote_build_ref_clusters()


# In[4]:

from ipyrad.assemble.clustmap import *


# In[6]:

self = Step3(data, 8, True, ipyclient)


# In[7]:

self.run()


# In[8]:

self.data.run("45")


# In[13]:

self.remote_index_refs()
self.remote_run(
    function=concat_multiple_edits,
    printstr=("concatenating ", "s3"),
    args=(),
)
self.remote_run(
    function=merge_end_to_end,
    printstr=("join unmerged pairs ", "s3"),
    args=(False, False,),
)


# In[15]:

self.remote_run(
    function=dereplicate,
    printstr=("dereplicating ", "s3"),
    args=(self.nthreads,),
    threaded=True,
)


# In[16]:

self.remote_run(
    function=split_endtoend_reads,
    printstr=("splitting dereps ", "s3"),
    args=(),
)


# In[ ]:

self.remote_run(
    function=mapping_reads,
    printstr=("mapping reads ", "s3"),
    args=(self.nthreads,),
    threaded=True,
)


# In[ ]:

sample = list(self.data.samples.values())[0]
merge_end_to_end(self.data, sample, True, True)


# In[ ]:


# In[15]:

sample = list(data.samples.values())[0]
infiles = [
    os.path.join(data.dirs.edits, "{}.trimmed_R1_.fastq.gz".format(sample.name)),
    os.path.join(data.dirs.edits, "{}_R1_concatedit.fq.gz".format(sample.name)),
    os.path.join(data.tmpdir, "{}_merged.fastq".format(sample.name)),
    os.path.join(data.tmpdir, "{}_declone.fastq".format(sample.name)),
]
infiles = [i for i in infiles if os.path.exists(i)]
infile = infiles[-1]
infile


# In[19]:

strand = "plus"
if data.paramsdict["datatype"] in ("gbs", "2brad"):
    strand = "both"

nthreads = 2
cmd = [
    ip.bins.vsearch,
    "--derep_fulllength", infile,
    "--strand", strand,
    "--output", os.path.join(data.tmpdir, sample.name + "_derep.fastq"),
    "--threads", str(nthreads),
    "--fasta_width", str(0),
    "--fastq_qmax", "1000",
    "--sizeout",
    "--relabel_md5",
]
cmd


# In[20]:

proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE, close_fds=True)
errmsg = proc.communicate()[0]
if proc.returncode:
    ip.logger.error("error inside dereplicate %s", errmsg)
    raise IPyradWarningExit(errmsg)


# In[10]:

self.remote_run(
    function=dereplicate,
    printstr=("dereplicating ", "s3"),
    args=(self.nthreads,),
    threaded=True,
)
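

# In[ ]:

# A minimal sketch (not part of the assembly run above) of the min-sample
# edge-trim logic used by get_edges()/check_minsamp(): keep only the columns
# in which at least `minsamp` rows hold real bases (not N=78 or dash=45).
# The toy array below is hypothetical.
import numpy as np

toy = np.array([
    list(b"NNAGCTAGNN"),
    list(b"NAAGCTAGCN"),
    list(b"NNAGCTAGCN"),
], dtype=np.uint8)

minsamp = min(4, toy.shape[0])                       # 3 for this toy locus
mincovs = np.sum((toy != 78) & (toy != 45), axis=0)  # per-column coverage
covered = np.where(mincovs >= minsamp)[0]            # columns passing minsamp
left_trim = covered.min()                            # 2
right_trim = toy.shape[1] - (covered.max() + 1)      # 2
print(left_trim, right_trim)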