import ipyrad as ip
import ipyparallel as ipp
from ipyrad.assemble.clustmap import *
from ipyrad.assemble.consens_se import *
ipyclient = ipp.Client()
/home/deren/miniconda3/lib/python3.6/site-packages/ipyparallel/client/client.py:459: RuntimeWarning: Controller appears to be listening on localhost, but not on this machine. If this is true, you should specify Client(...,sshserver='you@oud') or instruct your controller to listen on an external IP. RuntimeWarning)
data = ip.Assembly("test")
data.set_params("project_dir", "hotfix")
data.set_params("raw_fastq_path", "ipsimdata/pairddrad_example_R*_.fastq.gz")
data.set_params("barcodes_path", "ipsimdata/pairddrad_example_barcodes.txt")
data.set_params("reference_sequence", "ipsimdata/pairddrad_example_genome.fa")
data.set_params("assembly_method", "reference")
data.set_params("datatype", "pairddrad")
data.set_params("restriction_overhang", ("TGCAG", "CCG"))
data.get_params()
New Assembly: test 0 assembly_name test 1 project_dir ./hotfix 2 raw_fastq_path ./ipsimdata/pairddrad_example_R*_.fastq.gz 3 barcodes_path ./ipsimdata/pairddrad_example_barcodes.txt 4 sorted_fastq_path 5 assembly_method reference 6 reference_sequence ./ipsimdata/pairddrad_example_genome.fa 7 datatype pairddrad 8 restriction_overhang ('TGCAG', 'CCG') 9 max_low_qual_bases 5 10 phred_Qscore_offset 33 11 mindepth_statistical 6 12 mindepth_majrule 6 13 maxdepth 10000 14 clust_threshold 0.85 15 max_barcode_mismatch 0 16 filter_adapters 0 17 filter_min_trim_len 35 18 max_alleles_consens 2 19 max_Ns_consens (5, 5) 20 max_Hs_consens (8, 8) 21 min_samples_locus 4 22 max_SNPs_locus (20, 20) 23 max_Indels_locus (8, 8) 24 max_shared_Hs_locus 0.5 25 trim_reads (0, 0, 0, 0) 26 trim_loci (0, 0, 0, 0) 27 output_formats ['p', 's', 'v'] 28 pop_assign_file
data.run("12", force=True)
Assembly: test [####################] 100% 0:00:04 | sorting reads | s1 | [####################] 100% 0:00:01 | writing/compressing | s1 | [####################] 100% 0:00:04 | processing reads | s2 |
data.run("3")
Assembly: test [####################] 100% 0:00:00 | indexing reference | s3 | [####################] 100% 0:00:00 | concatenating | s3 | [####################] 100% 0:00:01 | join unmerged pairs | s3 | [####################] 100% 0:00:03 | dereplicating | s3 | [####################] 100% 0:00:00 | splitting dereps | s3 | [####################] 100% 0:00:03 | mapping reads | s3 | [####################] 100% 0:00:10 | building clusters | s3 | [####################] 100% 0:00:00 | calc cluster stats | s3 |
data.run('4')
Assembly: test [####################] 100% 0:00:06 | inferring [H, E] | s4 |
data.run("5")
Assembly: test [####################] 100% 0:00:00 | calculating depths | s5 | [####################] 100% 0:00:00 | chunking clusters | s5 | [####################] 100% 0:00:27 | consens calling | s5 | [####################] 100% 0:00:01 | indexing alleles | s5 |
data.run("6")
Assembly: test [####################] 100% 0:00:00 | concatenating bams | s6 | [####################] 100% 0:00:00 | fetching regions | s6 | [####################] 100% 0:00:00 | building loci | s6 |
from ipyrad.assemble.write_outputs import *
bself = Step7(data, True, ipyclient)
self.split_clusters()
jobs = glob.glob(os.path.join(self.data.tmpdir, "chunk-*"))
jobs = sorted(jobs, key=lambda x: int(x.rsplit("-")[-1]))
for jobfile in jobs:
args = (self.data, self.chunksize, jobfile)
def get_edges(self, seqs):
"""
Trim terminal edges or mask internal edges based on three criteria and
take the max for each edge.
1. user entered hard trimming.
2. removing cutsite overhangs.
3. trimming singleton-like overhangs from seqs of diff lengths.
"""
# record whether to filter this locus based on sample coverage
bad = False
# 1. hard trim edges
trim1 = np.array(self.data.paramsdict["trim_loci"])
# 2. fuzzy match for trimming restriction site where it's expected.
trim2 = np.array([0, 0, 0, 0])
overhangs = np.array([
i.encode() for i in self.data.paramsdict["restriction_overhang"]
])
for pos, overhang in enumerate(overhangs):
if overhang:
cutter = np.array(list(overhang))
trim2[pos] = check_cutter(seqs, pos, cutter, 0.75)
# 3. find where the edge is not indel marked (really unknown ("N"))
trim3 = np.array([0, 0, 0, 0])
try:
minsamp = min(4, seqs.shape[0])
# minsamp = max(minsamp, self.data.paramsdict["min_samples_locus"])
mincovs = np.sum((seqs != 78) & (seqs != 45), axis=0)
for pos in range(4):
trim3[pos] = check_minsamp(seqs, pos, minsamp, mincovs)
except ValueError:
print('error')
bad = True
# get max edges
print(trim1, trim2, trim3)
trim = np.max([trim1, trim2, trim3], axis=0)
# return edges as slice indices
r1left = trim[0]
# single-end simple:
if "pair" not in self.data.paramsdict["datatype"]:
r1right = seqs.shape[1] - trim[1]
r2left = r2right = r1right
edges = (r1left, r1right, r2left, r2right)
else:
r1right =
# get filter
if (r1right < r1left) or (r2left < r1right) or (r2right < r2left):
bad = True
# if loc length is too short then filter
if (r2right - r1left) < self.data.paramsdict["filter_min_trim_len"]:
bad = True
return bad, edges
def check_minsamp(seq, position, minsamp, mincovs):
"used in Processor.get_edges() for trimming edges of - or N sites."
if position == 0:
# find leftmost edge with minsamp coverage
leftmost = np.where(mincovs >= minsamp)[0]
if leftmost.size:
return leftmost.min()
# no sites actually have minsamp coverage although there are minsamp
# rows of data, this can happen when reads only partially overlap. Loc
# should be excluded for minsamp filter.
else:
raise ValueError("no sites above minsamp coverage in edge trim")
elif position == 1:
maxhit = np.where(mincovs >= minsamp)[0].max()
return seq.shape[1] - (maxhit + 1)
## TODO...
elif position == 2:
return 0
else:
return 0
# store list of edge trims for VCF building
edgelist = []
# todo: this could be an iterator...
with open(self.chunkfile, 'rb') as infile:
loci = infile.read().split(b"//\n//\n")
# iterate over loci
for iloc, loc in enumerate(loci):
# load names and seqs
lines = loc.decode().strip().split("\n")
names = []
nidxs = []
aseqs = []
useqs = []
for line in lines:
if line[0] == ">":
name, nidx = line[1:].rsplit("_", 1)
names.append(name)
nidxs.append(nidx)
else:
aseqs.append(list(line))
useqs.append(list(line.upper()))
# filter to include only samples in this assembly
mask = [i in self.data.snames for i in names]
names = np.array(names)[mask].tolist()
nidxs = np.array(nidxs)[mask].tolist()
useqs = np.array(useqs)[mask, :].astype(bytes).view(np.uint8)
aseqs = np.array(aseqs)[mask, :].astype(bytes).view(np.uint8)
# apply filters
efilter, edges = get_edges(self, useqs)
print(efilter, edges)
[0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 456, 456, 456) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 471, 471, 471) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 466, 466, 466) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 478, 478, 478) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 464, 464, 464) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 459, 459, 459) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 484, 484, 484) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 441, 441, 441) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 470, 470, 470) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 486, 486, 486) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 442, 442, 442) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 465, 465, 465) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 478, 478, 478) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 453, 453, 453) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 453, 453, 453) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 466, 466, 466) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 473, 473, 473) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 441, 441, 441) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 438, 438, 438) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 464, 464, 464) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 482, 482, 482) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 443, 443, 443) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 469, 469, 469) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 456, 456, 456) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 480, 480, 480) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 463, 463, 463) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 478, 478, 478) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 438, 438, 438) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 442, 442, 442) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 443, 443, 443) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 454, 454, 454) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 467, 467, 467) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 451, 451, 451) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 472, 472, 472) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 476, 476, 476) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 439, 439, 439) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 457, 457, 457) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 453, 453, 453) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 475, 475, 475) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 448, 448, 448) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 440, 440, 440) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 460, 460, 460) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 469, 469, 469) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 473, 473, 473) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 444, 444, 444) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 442, 442, 442) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 449, 449, 449) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 484, 484, 484) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 486, 486, 486) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 478, 478, 478) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 467, 467, 467) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 479, 479, 479) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 476, 476, 476) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 446, 446, 446) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 452, 452, 452) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 478, 478, 478) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 450, 450, 450) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 458, 458, 458) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 453, 453, 453) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 452, 452, 452) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 451, 451, 451) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 459, 459, 459) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 443, 443, 443) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 484, 484, 484) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 453, 453, 453) [0 0 0 0] [5 3 0 0] [0 0 0 0] False (5, 442, 442, 442)
data = self.data
chunksize = self.chunksize
chunkfile = jobs[0]
self = Processor(data, chunksize, chunkfile)
self.remote_process_chunks()
self.collect_stats()
self.store_file_handles()
[####################] 100% 0:00:01 | applying filters | s7 |
start = time.time()
printstr = ("building arrays ", "s7")
rasyncs = {}
args0 = (self.data,)
args1 = (self.data, self.ntaxa, self.nbases, self.nloci)
args2 = (self.data, self.ntaxa, self.nsnps)
write_loci_and_alleles(*args0)
fill_seq_array(*args1)
data = self.data
ntaxa = self.ntaxa
nsnps = self.nsnps
# get faidict to convert chroms to ints
if data.isref:
faidict = chroms2ints(data, True)
# open new database file handle
with h5py.File(data.snps_database, 'w') as io5:
# Database files for storing arrays on disk.
# Should optimize for slicing by rows if we run into slow writing, or
# it uses too much mem. For now letting h5py to auto-chunking.
io5.create_dataset(
name="snps",
shape=(ntaxa, nsnps),
dtype=np.uint8,
)
# store snp locations:
# (loc-counter, loc-snp-counter, loc-snp-pos, chrom, chrom-snp-pos)
io5.create_dataset(
name="snpsmap",
shape=(nsnps, 5),
dtype=np.uint32,
)
# store snp locations
io5.create_dataset(
name="pseudoref",
shape=(nsnps, 4),
dtype=np.uint8,
)
# store genotype calls (0/0, 0/1, 0/2, etc.)
io5.create_dataset(
name="genos",
shape=(nsnps, ntaxa, 2),
dtype=np.uint8,
)
# gather all loci bits
locibits = glob.glob(os.path.join(data.tmpdir, "*.loci"))
sortbits = sorted(locibits,
key=lambda x: int(x.rsplit("-", 1)[-1][:-5]))
# name order for entry in array
sidxs = {sample: i for (i, sample) in enumerate(data.snames)}
# iterate through file
start = end = 0
tmploc = {}
locidx = 1
snpidx = 1
# array to store until writing
tmparr = np.zeros((ntaxa, nsnps), dtype=np.uint8)
tmpmap = np.zeros((nsnps, 5), dtype=np.uint32)
# iterate over chunkfiles
for bit in sortbits:
# iterate lines of file until locus endings
for line in iter(open(bit, 'r')):
# still filling locus until |\n
if "|\n" not in line:
name, seq = line.split()
tmploc[name] = seq
# locus is full, dump it
else:
# convert seqs to an array
loc = (
np.array([list(i) for i in tmploc.values()])
.astype(bytes).view(np.uint8)
)
snps, idxs, _ = line[len(data.snppad):].rsplit("|", 2)
snpsmask = np.array(list(snps)) != " "
snpsidx = np.where(snpsmask)[0]
# select only the SNP sites
snpsites = loc[:, snpsmask]
# store end position of locus for map
end = start + snpsites.shape[1]
print(start, end)
for idx, name in enumerate(tmploc):
tmparr[sidxs[name], start:end] = snpsites[idx, :]
# store snpsmap data 1-indexed with chroms info
if data.isref:
chrom, pos = idxs.split(",")[0].split(":")
start = int(pos.split("-")[0])
#chromidx = faidict[chrom]
chromidx = int(chrom)
for isnp in range(snpsites.shape[1]):
isnpx = snpsidx[isnp]
tmpmap[snpidx - 1] = (
locidx, isnp, isnpx, chromidx, isnpx + start,
)
snpidx += 1
# store snpsmap data (snpidx is 1-indexed)
else:
for isnp in range(snpsites.shape[1]):
tmpmap[snpidx - 1] = (
locidx, isnp, snpsidx[isnp], 0, snpidx,
)
snpidx += 1
locidx += 1
# reset locus
start = end
tmploc = {}
# fill missing with 78 (N)
tmparr[tmparr == 0] = 78
0 6 6 16 16 28 28 37 37 46 46 51 51 61 61 74 74 83 83 89 89 102 102 112 112 121 121 128 128 134 134 142 142 149 149 158 158 163 163 172 172 187 187 201 201 208 208 217 217 226 226 239 239 246 246 257 257 265 265 273 273 277 277 290 290 297 297 301 301 311 311 320 320 330 330 340 340 352 352 361 361 369 369 381 381 391 391 404 404 413 413 419 419 432 432 442 442 458 458 465 465 482 482 487 487 492 492 499 499 511 511 517 517 530 530 534 534 543 543 549 549 562 562 574 574 585 585 597 597 612 612 620 620 627 627 635 635 645 645 661 661 667 667 683 683 691 691 697 697 710 710 721 721 737 737 745 745 751 751 771 771 779 779 785 785 792 792 809 809 819 819 831 831 838 838 848 848 863 863 871 871 881 881 892 892 895 895 903 903 921 921 927 927 934 934 946 946 954 954 964 964 975 975 979 979 989 989 996 996 1004 1004 1014 1014 1022 1022 1031 1031 1041 1041 1050 1050 1062 1062 1066 1066 1079 1079 1090 1090 1099 1099 1112 1112 1129 1129 1140 1140 1153 1153 1162 1162 1168 1168 1180 1180 1186 1186 1193 1193 1207 1207 1212 1212 1219 1219 1230 1230 1242 1242 1254 1254 1268 1268 1277 1277 1289 1289 1298 1298 1305 1305 1317 1317 1328 1328 1338 1338 1346 1346 1353 1353 1363 1363 1372 1372 1380 1380 1397 1397 1404 1404 1419 1419 1430 1430 1437 1437 1447 1447 1459 1459 1471 1471 1478 1478 1487 1487 1492 1492 1501 1501 1512 1512 1517 1517 1525 1525 1541 1541 1555 1555 1561 1561 1577 1577 1582 1582 1595 1595 1606 1606 1612 1612 1622 1622 1635 1635 1641 1641 1651 1651 1659 1659 1662 1662 1674 1674 1684 1684 1696 1696 1703 1703 1714 1714 1721 1721 1731 1731 1737 1737 1743 1743 1750 1750 1763 1763 1775 1775 1787 1787 1794 1794 1805 1805 1812 1812 1821 1821 1827 1827 1833 1833 1847 1847 1856 1856 1867 1867 1876 1876 1886 1886 1895 1895 1904 1904 1909 1909 1922 1922 1932 1932 1943 1943 1953 1953 1963 1963 1971 1971 1984 1984 1994 1994 1997 1997 2005 2005 2014 2014 2023 2023 2042 2042 2051 2051 2063 2063 2074 2074 2082 2082 2091 2091 2100 2100 2106 2106 2119 2119 2126 2126 2138 2138 2144 2144 2155 2155 2162 2162 2171 2171 2174 2174 2183 2183 2193 2193 2205 2205 2215 2215 2225 2225 2232 2232 2242 2242 2252 2252 2256 2256 2265 2265 2274 2274 2286 2286 2298 2298 2304 2304 2321 2321 2334 2334 2342 2342 2352 2352 2357 2357 2365 2365 2375 2375 2385 2385 2389 2389 2396 2396 2400 2400 2411 2411 2419 2419 2426 2426 2440 2440 2448 2448 2460 2460 2467 2467 2479
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-49-3630a9f0177d> in <module>() 85 86 for idx, name in enumerate(tmploc): ---> 87 tmparr[sidxs[name], start:end] = snpsites[idx, :] 88 89 # store snpsmap data 1-indexed with chroms info ValueError: could not broadcast input array from shape (12) into shape (7)
print(tmparr.shape)
print(start, end)
tmparr[sidxs[name], start:end]
(13, 2474) 2467 2479
array([0, 0, 0, 0, 0, 0, 0], dtype=uint8)
fill_snp_array(*args2)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-21-f157821ffeb6> in <module>() ----> 1 fill_snp_array(*args2) ~/Documents/ipyrad/ipyrad/assemble/write_outputs.py in fill_snp_array(data, ntaxa, nsnps) 1585 end = start + snpsites.shape[1] 1586 for idx, name in enumerate(tmploc): -> 1587 tmparr[sidxs[name], start:end] = snpsites[idx, :] 1588 1589 # store snpsmap data 1-indexed with chroms info ValueError: could not broadcast input array from shape (12) into shape (7)
data.run("7")
Assembly: test [####################] 100% 0:00:05 | applying filters | s7 | [####################] 100% 0:00:01 | building arrays | s7 | Encountered an unexpected error: ValueError(could not broadcast input array from shape (12) into shape (7))
---------------------------------------------------------------------------ValueError Traceback (most recent call last)<string> in <module>() ~/Documents/ipyrad/ipyrad/assemble/write_outputs.py in fill_snp_array(data, ntaxa, nsnps) 1585 end = start + snpsites.shape[1] 1586 for idx, name in enumerate(tmploc): -> 1587 tmparr[sidxs[name], start:end] = snpsites[idx, :] 1588 1589 # store snpsmap data 1-indexed with chroms info ValueError: could not broadcast input array from shape (12) into shape (7)
data.run("3")
Assembly: test [####################] 100% 0:00:00 | indexing reference | s3 | [####################] 100% 0:00:00 | concatenating | s3 | [####################] 100% 0:00:01 | join unmerged pairs | s3 | [####################] 100% 0:00:00 | dereplicating | s3 | [####################] 100% 0:00:00 | splitting dereps | s3 | [####################] 100% 0:00:02 | mapping reads | s3 | [####################] 100% 0:00:09 | building clusters | s3 | [####################] 100% 0:00:00 | calc cluster stats | s3 |
data.run("456")
Assembly: test [####################] 100% 0:00:02 | inferring [H, E] | s4 | [####################] 100% 0:00:00 | calculating depths | s5 | [####################] 100% 0:00:00 | chunking clusters | s5 | [####################] 100% 0:00:24 | consens calling | s5 | [####################] 100% 0:00:01 | indexing alleles | s5 | [####################] 100% 0:00:00 | concatenating bams | s6 | [####################] 100% 0:00:00 | building clusters | s6 | Encountered an unexpected error (see ./ipyrad_log.txt) Error message is below ------------------------------- name 'data' is not defined
data.run("1", force=True)
Assembly: test [force] overwriting fastq files previously created by ipyrad. This _does not_ affect your original/raw data files. [####################] 100% 0:00:03 | sorting reads | s1 | [####################] 100% 0:00:00 | writing/compressing | s1 |
data = ip.load_json("tortas/5-tortas.json")
loading Assembly: 5-tortas from saved path: ~/Documents/ipyrad/tests/tortas/5-tortas.json
d2 = data.branch("d2", subsamples=["AGO09concat"])
d2.run("3", force=True)
Assembly: d2 [####################] 100% 0:00:00 | indexing reference | s3 | [####################] 100% 0:00:00 | concatenating | s3 | [####################] 100% 0:02:44 | join unmerged pairs | s3 | [####################] 100% 0:01:14 | dereplicating | s3 | [####################] 100% 0:00:18 | splitting dereps | s3 | [####################] 100% 0:52:29 | mapping reads | s3 | [####################] 100% 0:55:43 | building clusters | s3 | [####################] 100% 0:00:13 | calc cluster stats | s3 |
d2.run("45", force=True)
Assembly: d2 [####################] 100% 0:00:21 | inferring [H, E] | s4 | [####################] 100% 0:00:13 | calculating depths | s5 | [####################] 100% 0:00:22 | chunking clusters | s5 | [####################] 100% 15:27:41 | consens calling | s5 | [####################] 100% 0:00:41 | indexing alleles | s5 |
d2 = ip.load_json("./tortas/d2.json")
loading Assembly: d2 from saved path: ~/Documents/ipyrad/tests/tortas/d2.json
d2.stats
state | reads_raw | reads_passed_filter | refseq_mapped_reads | refseq_unmapped_reads | clusters_total | clusters_hidepth | hetero_est | error_est | reads_consens | |
---|---|---|---|---|---|---|---|---|---|---|
AGO09concat | 5 | 15650127 | 15121047 | 8307851 | 6813196 | 1364279 | 381081 | 0.001636 | 0.001511 | 380396 |
self = Step5(data, True, ipyclient)
self.remote_calculate_depths()
[####################] 100% 0:00:17 | calculating depths | s5 |
self.remote_make_chunks()
[####################] 100% 0:00:28 | chunking clusters | s5 |
statsdicts = self.remote_process_chunks()
[####################] 100% 1:07:30 | consens calling | s5 |
statsdicts
{'AGO08concat': [({'name': 90642, 'heteros': 15544, 'nsites': 22873090, 'nconsens': 90642}, {'depth': 227971, 'maxh': 30, 'maxn': 2}), ({'name': 406474, 'heteros': 17080, 'nsites': 22034591, 'nconsens': 87829}, {'depth': 230745, 'maxh': 69, 'maxn': 2}), ({'name': 725373, 'heteros': 17170, 'nsites': 22095933, 'nconsens': 88083}, {'depth': 230497, 'maxh': 62, 'maxn': 3}), ({'name': 1044454, 'heteros': 28259, 'nsites': 22420201, 'nconsens': 88519}, {'depth': 229823, 'maxh': 301, 'maxn': 2})], 'AGO11concat': [({'name': 88491, 'heteros': 15532, 'nsites': 17265948, 'nconsens': 88491}, {'depth': 144187, 'maxh': 31, 'maxn': 0}), ({'name': 319207, 'heteros': 16819, 'nsites': 16849885, 'nconsens': 86498}, {'depth': 146149, 'maxh': 62, 'maxn': 0}), ({'name': 552196, 'heteros': 16745, 'nsites': 16895490, 'nconsens': 86778}, {'depth': 145870, 'maxh': 61, 'maxn': 0}), ({'name': 785074, 'heteros': 26253, 'nsites': 17128785, 'nconsens': 86947}, {'depth': 145382, 'maxh': 374, 'maxn': 0})], 'AGO02concat': [({'name': 79397, 'heteros': 14200, 'nsites': 16889465, 'nconsens': 79397}, {'depth': 189044, 'maxh': 24, 'maxn': 0}), ({'name': 345488, 'heteros': 15332, 'nsites': 16378730, 'nconsens': 77023}, {'depth': 191394, 'maxh': 48, 'maxn': 0}), ({'name': 614298, 'heteros': 15677, 'nsites': 16486755, 'nconsens': 77368}, {'depth': 191048, 'maxh': 49, 'maxn': 0}), ({'name': 883384, 'heteros': 24439, 'nsites': 16840082, 'nconsens': 77989}, {'depth': 190103, 'maxh': 370, 'maxn': 0})], 'AGO09concat': [({'name': 1, 'heteros': 0, 'nsites': 147, 'nconsens': 1}, {'depth': 9, 'maxh': 0, 'maxn': 0})]}
self.remote_concatenate_chunks()
self.data_store(statsdicts)
data.stats
state | reads_raw | reads_passed_filter | refseq_mapped_reads | refseq_unmapped_reads | clusters_total | clusters_hidepth | hetero_est | error_est | reads_consens | |
---|---|---|---|---|---|---|---|---|---|---|
AGO02concat | 5 | 11050294 | 10800672 | 7210402 | 3590270 | 1073857 | 312271 | 0.002455 | 0.001599 | 310509 |
AGO08concat | 5 | 13408401 | 13030329 | 7245593 | 5784736 | 1274580 | 355547 | 0.002825 | 0.001497 | 353139 |
AGO09concat | 5 | 15650127 | 15121047 | 8307851 | 6813196 | 1364279 | 1 | 0.002711 | 0.001882 | 378504 |
AGO11concat | 5 | 12848936 | 12370018 | 7855116 | 4514902 | 930830 | 349245 | 0.002108 | 0.001574 | 347812 |
dd = data.branch("dd")
dd.run("5", force=True)
Assembly: dd [####################] 100% 0:00:17 | calculating depths | s5 | [####################] 100% 0:00:31 | chunking clusters | s5 | [####################] 100% 1 day, 1:41:43 | consens calling | s5 | [####################] 100% 0:00:18 | indexing alleles | s5 | Encountered an error (see details in ./ipyrad_log.txt) Error summary is below ------------------------------- KeyboardInterrupt()
dd.run("3", force=True)
state | reads_raw | reads_passed_filter | refseq_mapped_reads | refseq_unmapped_reads | clusters_total | clusters_hidepth | hetero_est | error_est | reads_consens | |
---|---|---|---|---|---|---|---|---|---|---|
AGO02concat | 4 | 11050294 | 10800672 | 7210402 | 3590270 | 1073857 | 312271 | 0.001542 | 0.001308 | 310509 |
AGO08concat | 4 | 13408401 | 13030329 | 7245593 | 5784736 | 1274580 | 355547 | 0.001910 | 0.001097 | 353139 |
AGO09concat | 4 | 15650127 | 15121047 | 8307851 | 6813196 | 1364279 | 1 | 0.000042 | 0.002260 | 378504 |
AGO11concat | 4 | 12848936 | 12370018 | 7855116 | 4514902 | 930830 | 349245 | 0.001330 | 0.001307 | 347812 |
data.run("5", force=True)
Assembly: 5-tortas [####################] 100% 0:00:16 | calculating depths | s5 | [####################] 100% 0:00:29 | chunking clusters | s5 | [####################] 100% 1:04:59 | consens calling | s5 | [####################] 100% 0:00:59 | indexing alleles | s5 | Encountered an error (see details in ./ipyrad_log.txt) Error summary is below ------------------------------- IPyradError(error in samtools: b'[E::sam_parse1] CIGAR and query sequence are of different length\n[W::sam_read1] parse error at line 10786\n[main_samview] truncated file.\n')
step = Step6(data, True, ipyclient)
step.remote_concat_bams()
[####################] 100% 0:00:00 | concatenating bams | s6 |
step.remote_build_ref_regions()
[####################] 100% 0:00:00 | building clusters | s6 |
self = step
regs = self.regions[:20]
regs
[('MT', 5008, 5467), ('MT', 10476, 10950), ('MT', 15959, 16428), ('MT', 21437, 21918), ('MT', 26927, 27394), ('MT', 32403, 32865), ('MT', 37874, 38361), ('MT', 43370, 43814), ('MT', 48823, 49296), ('MT', 54305, 54794), ('MT', 59803, 60248), ('MT', 65257, 65725), ('MT', 70734, 71215), ('MT', 76224, 76680), ('MT', 81689, 82145), ('MT', 87154, 87623), ('MT', 92632, 93108), ('MT', 98117, 98561), ('MT', 103570, 104011), ('MT', 109020, 109487)]
# access reads from bam file using pysam
bamfile = AlignmentFile(
os.path.join(
self.data.dirs.across,
"cat.sorted.bam"),
'rb')
# catcons output file for raw clusters and db samples
outfile = gzip.open(
os.path.join(
self.data.dirs.across,
"{}.catcons.gz".format(self.data.name)),
'wb')
# write header line to catcons with sample names
snames = sorted([i.name for i in self.samples])
nsamples = len(snames)
outfile.write(
b" ".join([i.encode() for i in snames]) + b"\n")
# get clusters
lidx = 0
clusts = []
# while 1:
# try:
# region = next(self.regions)
# reads = bamfile.fetch(*region)
# except StopIteration:
# break
for region in regs:
reads = bamfile.fetch(*region)
# get reference
print(region)
refn, refs = get_ref_region(
data.paramsdict["reference_sequence"],
region[0], region[1]+1, region[2]+1)
# build cluster dict for sorting
rdict = {}
for read in reads:
rdict[read.qname] = read.seq
keys = sorted(rdict.keys(), key=lambda x: x.rsplit(":", 2)[0])
# build cluster based on map positions (reads are already oriented)
arr = np.zeros((nsamples + 1, len(refs)), dtype=bytes)
# fill it
arr[0] = list(refs)
for idx, key in enumerate(keys):
# get start and stop from this seq
sname = key.rsplit("_", 1)[0]
rstart, rstop = key.rsplit(":", 2)[-1].split("-")
sidx = snames.index(sname)
# get start relative to ref
start = int(rstart) - int(region[1]) - 1
stop = start + int(rstop) - int(rstart)
print(sidx + 1, start, stop, arr.shape[1])
arr[sidx + 1, int(start): int(stop)] = list(rdict[key])
print("")
arr[arr == b""] = b"N"
for line in arr:
outfile.write(line.tostring() + b"\n")
outfile.write(b"\n")
outfile.close()
('MT', 5008, 5467) 4 0 459 459 5 0 459 459 7 0 459 459 8 0 459 459 11 0 459 459 ('MT', 10476, 10950) 1 0 474 474 3 0 474 474 5 0 474 474 7 0 474 474 8 0 474 474 11 0 474 474 ('MT', 15959, 16428) 1 0 469 469 4 0 469 469 5 0 469 469 7 0 469 469 8 0 469 469 9 0 469 469 11 0 469 469 ('MT', 21437, 21918) 1 0 481 481 2 0 481 481 3 0 481 481 5 0 481 481 6 0 481 481 7 0 481 481 8 0 481 481 11 0 481 481 12 0 481 481 ('MT', 26927, 27394) 1 0 467 467 2 0 467 467 3 0 467 467 4 0 467 467 6 0 467 467 7 0 467 467 8 0 467 467 10 0 467 467 11 0 467 467 12 0 467 467 ('MT', 32403, 32865) 2 0 462 462 3 0 462 462 4 0 462 462 5 0 462 462 6 0 462 462 7 0 462 462 8 0 462 462 9 0 462 462 10 0 462 462 11 0 462 462 12 0 462 462 ('MT', 37874, 38361) 3 0 487 487 4 0 487 487 5 0 487 487 7 0 487 487 9 0 487 487 10 0 487 487 ('MT', 43370, 43814) 1 0 444 444 2 0 444 444 3 0 444 444 4 0 444 444 6 0 444 444 7 0 444 444 8 0 444 444 9 0 444 444 11 0 444 444 ('MT', 48823, 49296) 1 0 473 473 2 0 473 473 3 0 473 473 4 0 473 473 5 0 473 473 6 0 473 473 7 0 473 473 9 0 473 473 10 0 473 473 11 0 473 473 ('MT', 54305, 54794) 1 0 489 489 2 0 489 489 3 0 489 489 4 0 489 489 6 0 489 489 7 0 489 489 8 0 489 489 9 0 489 489 10 0 489 489 12 0 489 489 ('MT', 59803, 60248) 2 0 445 445 3 0 445 445 4 0 445 445 6 0 445 445 7 0 445 445 8 0 445 445 10 0 445 445 11 0 445 445 ('MT', 65257, 65725) 1 0 468 468 2 0 468 468 3 0 468 468 4 0 468 468 5 0 468 468 6 0 468 468 8 0 468 468 11 0 468 468 ('MT', 70734, 71215) 2 0 481 481 4 0 481 481 5 0 481 481 7 0 481 481 9 0 481 481 10 0 481 481 11 0 481 481 12 0 481 481 ('MT', 76224, 76680) 1 0 456 456 2 0 456 456 3 0 456 456 4 0 456 456 5 0 456 456 6 0 456 456 7 0 456 456 8 0 456 456 10 0 456 456 11 0 456 456 12 0 456 456 ('MT', 81689, 82145) 1 0 456 456 2 0 456 456 3 0 456 456 4 0 456 456 6 0 456 456 7 0 456 456 8 0 456 456 10 0 456 456 12 0 456 456 ('MT', 87154, 87623) 1 0 469 469 2 0 469 469 3 0 469 469 4 0 469 469 5 0 469 469 6 0 469 469 7 0 469 469 9 0 469 469 10 0 469 469 11 0 469 469 12 0 469 469 ('MT', 92632, 93108) 1 0 476 476 3 0 476 476 4 0 476 476 5 0 476 476 7 0 476 476 9 0 476 476 10 0 476 476 11 0 476 476 12 0 476 476 ('MT', 98117, 98561) 2 0 444 444 3 0 444 444 5 0 444 444 6 0 444 444 8 0 444 444 9 0 444 444 11 0 444 444 ('MT', 103570, 104011) 1 0 441 441 2 0 441 441 4 0 441 441 5 0 441 441 6 0 441 441 7 0 441 441 8 0 441 441 9 0 441 441 10 0 441 441 11 0 441 441 ('MT', 109020, 109487) 1 0 467 467 2 0 467 467 3 0 467 467 4 0 467 467 6 0 467 467 7 0 467 467 8 0 467 467 9 0 467 467 10 0 467 467 11 0 467 467 12 0 467 467
print(start, stop, stop-start, len(rdict[key]), rstart, rstop, int(rstop) - int(rstart))
print(region, region[2] - region[1], len(refs))
300 559 259 559 41920 42479 559 ('Contig0', 41619, 42478) 859 859
key.split("")
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-61-55088ce8a832> in <module>() ----> 1 key.split("") ValueError: empty separator
snames
['AGO02concat', 'AGO08concat', 'AGO09concat', 'AGO11concat']
revcomp("AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCKCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN")
'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGCTTGTATTAGAGATGGGGCTGAAGTCAAACCCTGACCTGAACAGTGAGAAAATCTGGTGCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTCCTCTGCCTCCCTCAAGAAGAAATCAAGAGKGGAAAGGAGCAGGGCGGGAGGTATGGGAAAGGAAGAATGGAATT'
# get consens seq and variant site index
clust = []
avars = refvars(arr.view(np.uint8), PSEUDO_REF)
dat = b"".join(avars.view("S1")[:, 0]).decode()
snpstring = "".join(["*" if i else " " for i in avars[:, 1]])
clust.append("ref_{}:{}-{}\n{}".format(*region, dat))
# or, just build variant string (or don't...)
# write all loci with [locids, nsnps, npis, nindels, ?]
for key in keys:
clust.append("{}\n{}".format(key, rdict[key]))
clust.append("SNPs\n" + snpstring)
clusts.append("\n".join(clust))
# advance locus counter
lidx += 1
# write chunks to file
if not lidx % 1000:
outfile.write(
str.encode("\n//\n//\n".join(clusts) + "\n//\n//\n"))
clusts = []
# write remaining
if clusts:
outfile.write(
str.encode("\n//\n//\n".join(clusts) + "\n//\n//\n"))
outfile.close()
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-37-7858eebe5047> in <module>() 43 arr = np.zeros((len(rdict), len(refs)), dtype=bytes) 44 for idx, key in enumerate(keys): ---> 45 arr[idx] = list(rdict[key]) 46 47 # get consens seq and variant site index ValueError: cannot copy sequence with size 231 to array axis with dimension 535
step.remote_build_ref_clusters()
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-16-2a0e5422841e> in <module>() ----> 1 step.remote_build_ref_clusters() ~/Documents/ipyrad/ipyrad/assemble/cluster_across.py in remote_build_ref_clusters(self) 588 arr = np.zeros((len(rdict), len(read.seq)), dtype=bytes) 589 for idx, key in enumerate(keys): --> 590 arr[idx] = list(rdict[key]) 591 592 # get consens seq and variant site index ValueError: cannot copy sequence with size 543 to array axis with dimension 559
from ipyrad.assemble.clustmap import *
self = Step3(data, 8, True, ipyclient)
self.run()
[####################] 100% 0:00:00 | indexing reference | s3 | [####################] 100% 0:00:00 | concatenating | s3 | [####################] 100% 0:05:42 | join unmerged pairs | s3 | [####################] 100% 0:02:17 | dereplicating | s3 | [####################] 100% 0:00:39 | splitting dereps | s3 | [####################] 100% 1:53:03 | mapping reads | s3 | [####################] 100% 1:40:12 | building clusters | s3 | [####################] 100% 0:00:24 | calc cluster stats | s3 |
self.data.run("45")
Assembly: 5-tortas [####################] 100% 0:15:41 | inferring [H, E] | s4 | [####################] 100% 0:00:26 | calculating depths | s5 | [####################] 100% 0:00:45 | chunking clusters | s5 | [####################] 100% 16:10:59 | consens calling | s5 | [####################] 100% 0:01:31 | indexing alleles | s5 |
self.remote_index_refs()
self.remote_run(
function=concat_multiple_edits,
printstr=("concatenating ", "s3"),
args=(),
)
self.remote_run(
function=merge_end_to_end,
printstr=("join unmerged pairs ", "s3"),
args=(False, False,),
)
[####################] 100% 0:00:00 | indexing reference | s3 | [####################] 100% 0:00:00 | concatenating | s3 | [####################] 100% 0:05:49 | join unmerged pairs | s3 |
self.remote_run(
function=dereplicate,
printstr=("dereplicating ", "s3"),
args=(self.nthreads,),
threaded=True,
)
[####################] 100% 0:02:22 | dereplicating | s3 |
self.remote_run(
function=split_endtoend_reads,
printstr=("splitting dereps ", "s3"),
args=(),
)
[####################] 100% 0:00:36 | splitting dereps | s3 |
self.remote_run(
function=mapping_reads,
printstr=("mapping reads ", "s3"),
args=(self.nthreads,),
threaded=True,
)
[########## ] 50% 1:18:01 | mapping reads | s3 |
sample = list(self.data.samples.values())[0]
merge_end_to_end(self.data, sample, True, True)
sample = list(data.samples.values())[0]
infiles = [
os.path.join(data.dirs.edits, "{}.trimmed_R1_.fastq.gz".format(sample.name)),
os.path.join(data.dirs.edits, "{}_R1_concatedit.fq.gz".format(sample.name)),
os.path.join(data.tmpdir, "{}_merged.fastq".format(sample.name)),
os.path.join(data.tmpdir, "{}_declone.fastq".format(sample.name)),
]
infiles = [i for i in infiles if os.path.exists(i)]
infile = infiles[-1]
infile
'/home/deren/Documents/ipyrad/tests/tortas/5-tortas-tmpalign/AGO02concat_merged.fastq'
strand = "plus"
if data.paramsdict["datatype"] is ('gbs' or '2brad'):
strand = "both"
nthreads=2
cmd = [
ip.bins.vsearch,
"--derep_fulllength", infile,
"--strand", strand,
"--output", os.path.join(data.tmpdir, sample.name + "_derep.fastq"),
"--threads", str(nthreads),
"--fasta_width", str(0),
"--fastq_qmax", "1000",
"--sizeout",
"--relabel_md5",
]
cmd
['/home/deren/Documents/ipyrad/bin/vsearch-linux-x86_64', '--derep_fulllength', '/home/deren/Documents/ipyrad/tests/tortas/5-tortas-tmpalign/AGO02concat_merged.fastq', '--strand', 'plus', '--output', '/home/deren/Documents/ipyrad/tests/tortas/5-tortas-tmpalign/AGO02concat_derep.fastq', '--threads', '2', '--fasta_width', '0', '--fastq_qmax', '1000', '--sizeout', '--relabel_md5']
proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE, close_fds=True)
errmsg = proc.communicate()[0]
if proc.returncode:
ip.logger.error("error inside dereplicate %s", errmsg)
raise IPyradWarningExit(errmsg)
s.remote_run(
function=dereplicate,
printstr=("dereplicating ", "s3"),
args=(s.nthreads,),
threaded=True,
)
[ ] 0% 0:01:55 | dereplicating | s3 |
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-10-9074a628ea2e> in <module>() 3 printstr=("dereplicating ", "s3"), 4 args=(s.nthreads,), ----> 5 threaded=True, 6 ) ~/Documents/ipyrad/ipyrad/assemble/clustmap.py in remote_run(self, printstr, function, args, threaded) 598 ready = [rasyncs[i].ready() for i in rasyncs] 599 self.data._progressbar(len(ready), sum(ready), start, printstr) --> 600 time.sleep(0.1) 601 if len(ready) == sum(ready): 602 break KeyboardInterrupt: