# Notebook setup: ipyrad assembly toolkit plus an ipyparallel client
# connected to a running ipcluster (the controller must already be up).
import ipyrad as ip
import ipyparallel as ipp
ipyclient = ipp.Client()
/home/deren/miniconda3/lib/python3.6/site-packages/ipyparallel/client/client.py:458: RuntimeWarning: Controller appears to be listening on localhost, but not on this machine. If this is true, you should specify Client(...,sshserver='you@sacra') or instruct your controller to listen on an external IP. RuntimeWarning)
# Load a saved assembly and branch it for testing.
data = ip.load_json("6-tortas/test.json")
test2 = data.branch("test2")
#test2.run("5", force=True)
# NOTE(review): the traceback below shows Assembly has no
# remote_calculate_depths/remote_make_chunks — these appear to live on the
# Step5 class, not on Assembly (see the Step5 usage further down).
test2.remote_calculate_depths()
test2.remote_make_chunks()
loading Assembly: test from saved path: ~/local/src/ipyrad/tests/6-tortas/test.json
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-4-a594ca197022> in <module>() 2 test2 = data.branch("test2") 3 #test2.run("5", force=True) ----> 4 test2.remote_calculate_depths() 5 test2.remote_make_chunks() AttributeError: 'Assembly' object has no attribute 'remote_calculate_depths'
# Correct usage: the remote_* helpers belong to Step5 (force=True, with the
# ipyparallel client), not to the Assembly object itself.
self = Step5(test2, True, ipyclient)
self.remote_calculate_depths()
self.remote_make_chunks()
loading Assembly: test from saved path: ~/local/src/ipyrad/tests/6-tortas/test.json [####################] 100% 0:00:14 | calculating depths | s5 | [####################] 100% 0:00:31 | chunking clusters | s5 |
# Pick one sample and grab its second tmpdir chunk file for manual testing.
data = test2
sample = data.samples["PZ03concat"]
isref = True
# single-element loop kept so the body matches the real per-sample loop
for sample in [sample]:
    chunks = glob.glob(os.path.join(
        data.tmpdir,
        "{}.chunk.*".format(sample.name)))
    # sort numerically by the trailing chunk index
    chunks.sort(key=lambda x: int(x.split('.')[-1]))
    chunkfile = chunks[1]
chunkfile
'/home/deren/local/src/ipyrad/tests/6-tortas/test2-tmpdir/PZ03concat.chunk.35195.35195'
# Run the Processor on the chosen chunk and inspect its stats/filter counts.
p = Processor(data, sample, chunkfile, isref)
p.run()
p.counters, p.filters
({'name': 45206, 'heteros': 1576, 'nsites': 4168046, 'nconsens': 10011}, {'depth': 25179, 'maxh': 3, 'maxn': 2})
class Processor:
    """Process one chunk file of clustered reads into consensus calls.

    Streams clusters from `chunkfile`, applies depth/max-hetero/max-N
    filters, calls consensus bases, phases alleles where possible, and
    writes a tmp consensus file plus an HDF5 file of depth/allele arrays.

    Parameters
    ----------
    data : Assembly-like object (reads .stats, .paramsdict, ._hackersonly,
        .tmpdir).
    sample : sample object (reads .name).
    chunkfile : path named "<sample>.chunk.<optim>.<tmpnum>"; the last two
        dot-fields are parsed as ints in set_params().
    isref : bool; reference-mapped mode (parses .fai, relaxes max-N).
    """

    def __init__(self, data, sample, chunkfile, isref):
        self.data = data
        self.sample = sample
        self.chunkfile = chunkfile
        self.isref = isref
        # prepare the processor
        self.set_params()
        self.init_counters()
        self.init_arrays()
        self.chroms2ints()
        # run the processor
        #self.run()

    def run(self):
        """Process all clusters in the chunk, then write outputs."""
        self.process_chunk()
        self.write_chunk()

    def set_params(self):
        """Parse chunkfile name and assembly params into limits."""
        # set max limits; tmpnum/optim come from the chunkfile name fields
        self.tmpnum = int(self.chunkfile.split(".")[-1])
        self.optim = int(self.chunkfile.split(".")[-2])
        # mean error/hetero estimates across samples
        self.este = self.data.stats.error_est.mean()
        self.esth = self.data.stats.hetero_est.mean()
        self.maxlen = self.data._hackersonly["max_fragment_length"]
        # paired data sums the per-read (R1, R2) limits
        if 'pair' in self.data.paramsdict["datatype"]:
            self.maxhet = sum(self.data.paramsdict["max_Hs_consens"])
            self.maxn = sum(self.data.paramsdict["max_Ns_consens"])
        else:
            self.maxhet = self.data.paramsdict["max_Hs_consens"][0]
            self.maxn = self.data.paramsdict["max_Ns_consens"][0]
        # max-N not enforced for reference-mapped data (internal spacers)
        if self.isref:
            self.maxn = int(1e6)

    def init_counters(self):
        """Zero the stats and filter counters."""
        # store data for stats counters; "name" starts at the chunk index
        self.counters = {
            "name": self.tmpnum,
            "heteros": 0,
            "nsites": 0,
            "nconsens": 0,
        }
        # store data for what got filtered
        self.filters = {
            "depth": 0,
            "maxh": 0,
            "maxn": 0,
        }
        # consensus sequences keyed by output row index
        self.storeseq = {}

    def init_arrays(self):
        """Allocate per-chunk output arrays sized by optim (max clusters)."""
        # catarr: per-locus, per-site CATG depth counts
        self.catarr = np.zeros((self.optim, self.maxlen, 4), dtype=np.uint32)
        # nallel: number of alleles observed per locus
        self.nallel = np.zeros((self.optim, ), dtype=np.uint8)
        # refarr: (chrom-int, start, end) reference position per locus
        self.refarr = np.zeros((self.optim, 3), dtype=np.int64)

    def chroms2ints(self):
        """Map scaffold names <-> integer indices from the reference .fai."""
        # if reference-mapped then parse the fai to get index number of chroms
        if self.isref:
            fai = pd.read_csv(
                self.data.paramsdict["reference_sequence"] + ".fai",
                names=['scaffold', 'size', 'sumsize', 'a', 'b'],
                sep="\t")
            self.faidict = {j: i for i, j in enumerate(fai.scaffold)}
            self.revdict = {j: i for i, j in self.faidict.items()}

    # ---------------------------------------------
    def process_chunk(self):
        """Stream clusters from the chunkfile through the filter cascade."""
        with open(self.chunkfile, 'rb') as inclust:
            # pair up (name, sequence) byte lines
            pairdealer = izip(*[iter(inclust)] * 2)
            done = 0
            while not done:
                done, chunk = clustdealer(pairdealer, 1)
                if chunk:
                    self.parse_cluster(chunk)
                    # filter cascade: depth -> maxH -> maxN -> store
                    if self.filter_mindepth():
                        self.build_consens_and_array()
                        self.get_heteros()
                        if self.filter_maxhetero():
                            if self.filter_maxN_minLen():
                                self.get_alleles()
                                self.store_data()

    def parse_cluster(self, chunk):
        """Split one cluster into names, seqs, rep counts and ref position."""
        # get names and seqs (alternating lines)
        piece = chunk[0].decode().strip().split("\n")
        self.names = piece[0::2]
        self.seqs = piece[1::2]
        # pull replicate read counts from the ";size=N;" name field
        self.reps = [int(n.split(";")[-2][5:]) for n in self.names]
        # ref positions; (-1, 0, 0) marks "no reference position"
        self.ref_position = (-1, 0, 0)
        if self.isref:
            # parse "chrom:start-end" position from the first read's name
            rname = self.names[0].rsplit(";", 2)[0]
            chrom, posish = rname.rsplit(":")
            pos0, pos1 = posish.split("-")
            # pull idx from .fai reference dict; 1-based chrom index
            chromint = self.faidict[chrom] + 1
            self.ref_position = (int(chromint), int(pos0), int(pos1))

    def filter_mindepth(self):
        """Return 1 if the cluster passes the min-depth filter, else 0."""
        # exit if nreps is less than mindepth setting
        if not nfilter1(self.data, self.reps):
            self.filters['depth'] += 1
            return 0
        return 1

    def build_consens_and_array(self):
        """Build the read stack array and call a trimmed consensus."""
        # expand reads by replicate counts into a 2-D byte array
        sseqs = [list(seq) for seq in self.seqs]
        arrayed = np.concatenate(
            [[seq] * rep for (seq, rep) in zip(sseqs, self.reps)]
        ).astype(bytes)
        # ! enforce maxlen limit !
        self.arrayed = arrayed[:, :self.maxlen]
        # get unphased consens sequence from arrayed
        self.consens = base_caller(
            self.arrayed,
            self.data.paramsdict["mindepth_majrule"],
            self.data.paramsdict["mindepth_statistical"],
            self.esth,
            self.este,
        )
        # trim Ns from the left and right ends (gaps count as N)
        self.consens[self.consens == b"-"] = b"N"
        trim = np.where(self.consens != b"N")[0]
        ltrim, rtrim = trim.min(), trim.max()
        self.consens = self.consens[ltrim:rtrim + 1]
        self.arrayed = self.arrayed[:, ltrim:rtrim + 1]
        # update position for trimming
        # NOTE(review): the new end adds both ltrim and rtrim to start;
        # end = start + rtrim + 1 looks intended — confirm against caller.
        self.ref_position = (
            self.ref_position[0],
            self.ref_position[1] + ltrim,
            self.ref_position[1] + ltrim + rtrim + 1,
        )

    def get_heteros(self):
        """Record indices of IUPAC ambiguity (hetero) sites in consens."""
        self.hidx = [
            i for (i, j) in enumerate(self.consens) if
            j.decode() in list("RKSYWM")]
        self.nheteros = len(self.hidx)

    def filter_maxhetero(self):
        """Return 1 if hetero count is within maxhet, else 0."""
        if not nfilter2(self.nheteros, self.maxhet):
            self.filters['maxh'] += 1
            return 0
        return 1

    def filter_maxN_minLen(self):
        """Return 1 if N count / length pass the maxn filter, else 0."""
        if not nfilter3(self.consens, self.maxn):
            self.filters['maxn'] += 1
            return 0
        return 1

    def get_alleles(self):
        """Count alleles at hetero sites and phase consens if diploid."""
        # if less than two Hs then there is only one allele
        if len(self.hidx) < 2:
            self.nalleles = 1
        else:
            # array of hetero sites
            harray = self.arrayed[:, self.hidx]
            # remove reads with - or N at variable site
            harray = harray[~np.any(harray == b"-", axis=1)]
            harray = harray[~np.any(harray == b"N", axis=1)]
            # get counts of each allele (e.g., AT:2, CG:2)
            ccx = Counter([tuple(i) for i in harray])
            # remove low freq alleles if more than 2, since they may reflect
            # seq errors at hetero sites, making a third allele, or a new
            # allelic combination that is not real.
            if len(ccx) > 2:
                totdepth = harray.shape[0]
                cutoff = max(1, totdepth // 10)
                alleles = [i for i in ccx if ccx[i] > cutoff]
            else:
                alleles = ccx.keys()
            self.nalleles = len(alleles)
            if self.nalleles == 2:
                try:
                    self.consens = storealleles(self.consens, self.hidx, alleles)
                except (IndexError, KeyError):
                    # the H sites do not form good alleles
                    ip.logger.info("failed at phasing loc, skipping")
                    ip.logger.info("""
                        consens %s
                        hidx %s
                        alleles %s
                        """, self.consens, self.hidx, alleles)
                # todo return phase or no-phase and store for later
                # ...

    def store_data(self):
        """Store consens, allele count, depths and ref position at cidx."""
        # current counter
        cidx = self.counters["nconsens"]
        self.nallel[cidx] = self.nalleles
        self.refarr[cidx] = self.ref_position
        # store a reduced array with only CATG depth counts
        # NOTE(review): uint16 here saturates silently at 65535 via overflow
        # semantics; the write_chunk clamp is the guard — confirm ordering.
        catg = np.array(
            [np.sum(self.arrayed == i, axis=0) for i in [b'C', b'A', b'T', b'G']],
            dtype='uint16').T
        # do not allow ints larger than 65535 (uint16)
        self.catarr[cidx, :catg.shape[0], :] = catg
        # store the seqdata and advance counters
        self.storeseq[cidx] = b"".join(list(self.consens))
        self.counters["name"] += 1
        self.counters["nconsens"] += 1
        self.counters["heteros"] += self.nheteros

    # ----------------------------------------------
    def write_chunk(self):
        """Write tmp consens file and the matching HDF5 arrays, then free."""
        # find last filled row: first all-zero refarr row, else full length
        end = np.where(np.all(self.refarr == 0, axis=1))[0]
        if np.any(end):
            end = end.min()
        else:
            end = self.refarr.shape[0]
        # write final consens string chunk
        consenshandle = os.path.join(
            self.data.tmpdir,
            "{}_tmpcons.{}.{}".format(self.sample.name, end, self.tmpnum))
        # write chunk.
        if self.storeseq:
            with open(consenshandle, 'wt') as outfile:
                # denovo just write the consens simple (fasta-like)
                if not self.isref:
                    outfile.write(
                        "\n".join(
                            [">" + self.sample.name + "_" + str(key) + \
                            "\n" + self.storeseq[key].decode()
                            for key in self.storeseq]))
                # reference needs to store if the read is revcomp to reference
                # (SAM-like tab-separated records)
                else:
                    outfile.write(
                        "\n".join(
                            ["{}:{}:{}-{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}"
                            .format(
                                self.sample.name,
                                self.refarr[i][0],
                                self.refarr[i][1],
                                self.refarr[i][2],
                                0,
                                self.revdict[self.refarr[i][0] - 1],
                                self.refarr[i][1],
                                0,
                                #"{}M".format(refarr[i][2] - refarr[i][1]),
                                # NOTE(review): the comprehension variable `i`
                                # shadows the outer key `i`; works in py3 since
                                # the iterable is evaluated first, but fragile.
                                make_allele_cigar(
                                    "".join(['.' if i.islower() else i for i
                                             in self.storeseq[i].decode()]),
                                    ".",
                                    "S",
                                ),
                                #"{}M".format(len(self.storeseq[i].decode())),
                                "*",
                                0,
                                self.refarr[i][2] - self.refarr[i][1],
                                self.storeseq[i].decode(),
                                "*",
                                # "XT:Z:{}".format(
                                #    make_indel_cigar(storeseq[i].decode())
                                #    ),
                            ) for i in self.storeseq.keys()]
                        ))
        # store reduced size arrays with indexes matching to keep indexes
        tmp5 = consenshandle.replace("_tmpcons.", "_tmpcats.")
        with h5py.File(tmp5, 'w') as io5:
            # reduce catarr to uint16 size for easier storage
            self.catarr[self.catarr > 65535] = 65535
            self.catarr = self.catarr.astype(np.uint16)
            io5.create_dataset(name="cats", data=self.catarr[:end])
            io5.create_dataset(name="alls", data=self.nallel[:end])
            io5.create_dataset(name="chroms", data=self.refarr[:end])
        # free the large arrays
        del self.catarr
        del self.nallel
        del self.refarr
        # return stats
        self.counters['nsites'] = sum([len(i) for i in self.storeseq.values()])
        del self.storeseq
# Run the chunk processor remotely on an engine.
# NOTE(review): `process_chunks` is not defined anywhere in this transcript —
# presumably a module-level function; verify the name before reuse.
res = ipyclient.apply(process_chunks, *(data, sample, chunkfile, isref))
res.result()
# Local run for comparison; __init__ does NOT call run(), so the counters
# shown below must come from a run() call omitted from the transcript.
statsdicts = Processor(data, sample, chunkfile, isref)
statsdicts.counters, statsdicts.filters
({'name': 10148, 'heteros': 1736, 'nsites': 4258854, 'nconsens': 10148}, {'depth': 25043, 'maxh': 4, 'maxn': 0})
aa = ipyclient[0].apply(Processor, )
<__main__.D at 0x7f167c837b70>
# write sequences to SAM file: collect this sample's tmp consensus chunks,
# ordered numerically by the trailing chunk index
combs1 = glob.glob(os.path.join(
    data.tmpdir,
    "{}_tmpcons.*".format(sample.name)))
combs1.sort(key=lambda x: int(x.split(".")[-1]))
combs1
[]
# Write per-chunk consensus records into a single SAM file for this sample.
# Headers (@HD/@SQ) are built from the reference .fai index; read names are
# renumbered with a running counter so they line up with the catg array order.
combs1 = glob.glob(os.path.join(
    data.tmpdir,
    "{}_tmpcons.*".format(sample.name)))
# sort numerically by the trailing chunk index
combs1.sort(key=lambda x: int(x.split(".")[-1]))
# open sam handle for writing to bam
samfile = sample.files.consens.replace(".bam", ".sam")
with open(samfile, 'w') as outf:
    # parse fai file for writing headers
    fai = "{}.fai".format(data.paramsdict["reference_sequence"])
    fad = pd.read_csv(fai, sep="\t", names=["SN", "LN", "POS", "N1", "N2"])
    headers = ["@HD\tVN:1.0\tSO:coordinate"]
    headers += [
        "@SQ\tSN:{}\tLN:{}".format(i, j)
        for (i, j) in zip(fad["SN"], fad["LN"])
    ]
    outf.write("\n".join(headers) + "\n")
    # write to file with sample names imputed to line up with catg array
    counter = 0
    for fname in combs1:
        with open(fname) as infile:
            # impute catg ordered seqnames
            # BUGFIX: the original assigned `data = infile.readlines()`,
            # clobbering the Assembly object bound to `data` (used above for
            # tmpdir and paramsdict). Use a distinct local name instead.
            reclines = infile.readlines()
            fdata = []
            for line in reclines:
                name, chrom, rest = line.rsplit(":", 2)
                fdata.append(
                    "{}_{}:{}:{}".format(name, counter, chrom, rest)
                )
                counter += 1
            outf.write("".join(fdata) + "\n")
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-138-6cbce1edccb2> in <module>() ----> 1 self.refarr AttributeError: 'Processor' object has no attribute 'refarr'
class Processor:
    """Revised chunk processor (second draft in this transcript).

    Differences from the first draft above: depth counts are kept as uint32
    (no uint16 clamp), and reference-mode records use a simple "<len>M"
    CIGAR instead of make_allele_cigar. Same filter cascade and outputs.
    """

    def __init__(self, data, sample, chunkfile, isref):
        self.data = data
        self.sample = sample
        self.chunkfile = chunkfile
        self.isref = isref
        # prepare the processor
        self.set_params()
        self.init_counters()
        self.init_arrays()
        self.chroms2ints()
        # run the processor

    def run(self):
        """Process all clusters in the chunk, then write outputs."""
        self.process_chunk()
        self.write_chunk()

    def set_params(self):
        """Parse chunkfile name and assembly params into limits."""
        # set max limits; tmpnum/optim come from the chunkfile name fields
        self.tmpnum = int(self.chunkfile.split(".")[-1])
        self.optim = int(self.chunkfile.split(".")[-2])
        # mean error/hetero estimates across samples
        self.este = self.data.stats.error_est.mean()
        self.esth = self.data.stats.hetero_est.mean()
        self.maxlen = self.data._hackersonly["max_fragment_length"]
        # paired data sums the per-read (R1, R2) limits
        if 'pair' in self.data.paramsdict["datatype"]:
            self.maxhet = sum(self.data.paramsdict["max_Hs_consens"])
            self.maxn = sum(self.data.paramsdict["max_Ns_consens"])
        else:
            self.maxhet = self.data.paramsdict["max_Hs_consens"][0]
            self.maxn = self.data.paramsdict["max_Ns_consens"][0]
        # max-N not enforced for reference-mapped data
        if self.isref:
            self.maxn = int(1e6)

    def init_counters(self):
        """Zero the stats and filter counters."""
        # store data for stats counters.
        self.counters = {
            "name": self.tmpnum,
            "heteros": 0,
            "nsites": 0,
            "nconsens": 0,
        }
        # store data for what got filtered
        self.filters = {
            "depth": 0,
            "maxh": 0,
            "maxn": 0,
        }
        # consensus sequences keyed by output row index
        self.storeseq = {}

    def init_arrays(self):
        """Allocate per-chunk output arrays sized by optim (max clusters)."""
        # catarr: per-locus, per-site CATG depth counts
        self.catarr = np.zeros((self.optim, self.maxlen, 4), dtype=np.uint32)
        # nallel: number of alleles observed per locus
        self.nallel = np.zeros((self.optim, ), dtype=np.uint8)
        # refarr: (chrom-int, start, end) reference position per locus
        self.refarr = np.zeros((self.optim, 3), dtype=np.int64)

    def chroms2ints(self):
        """Map scaffold names <-> integer indices from the reference .fai."""
        # if reference-mapped then parse the fai to get index number of chroms
        if self.isref:
            fai = pd.read_csv(
                self.data.paramsdict["reference_sequence"] + ".fai",
                names=['scaffold', 'size', 'sumsize', 'a', 'b'],
                sep="\t")
            self.faidict = {j: i for i, j in enumerate(fai.scaffold)}
            self.revdict = {j: i for i, j in self.faidict.items()}

    # ---------------------------------------------
    def process_chunk(self):
        """Stream clusters from the chunkfile through the filter cascade."""
        with open(self.chunkfile, 'rb') as inclust:
            # pair up (name, sequence) byte lines
            pairdealer = izip(*[iter(inclust)] * 2)
            done = 0
            while not done:
                done, chunk = clustdealer(pairdealer, 1)
                if chunk:
                    self.parse_cluster(chunk)
                    # filter cascade: depth -> maxH -> maxN -> store
                    if self.filter_mindepth():
                        self.build_consens_and_array()
                        self.get_heteros()
                        if self.filter_maxhetero():
                            if self.filter_maxN_minLen():
                                self.get_alleles()
                                self.store_data()

    def parse_cluster(self, chunk):
        """Split one cluster into names, seqs, rep counts and ref position."""
        # get names and seqs (alternating lines)
        piece = chunk[0].decode().strip().split("\n")
        self.names = piece[0::2]
        self.seqs = piece[1::2]
        # pull replicate read counts from the ";size=N;" name field
        self.reps = [int(n.split(";")[-2][5:]) for n in self.names]
        # ref positions; (-1, 0, 0) marks "no reference position"
        self.ref_position = (-1, 0, 0)
        if self.isref:
            # parse "chrom:start-end" position from the first read's name
            rname = self.names[0].rsplit(";", 2)[0]
            chrom, posish = rname.rsplit(":")
            pos0, pos1 = posish.split("-")
            # pull idx from .fai reference dict; 1-based chrom index
            chromint = self.faidict[chrom] + 1
            self.ref_position = (int(chromint), int(pos0), int(pos1))

    def filter_mindepth(self):
        """Return 1 if the cluster passes the min-depth filter, else 0."""
        # exit if nreps is less than mindepth setting
        if not nfilter1(self.data, self.reps):
            self.filters['depth'] += 1
            return 0
        return 1

    def build_consens_and_array(self):
        """Build the read stack array and call a trimmed consensus."""
        # expand reads by replicate counts into a 2-D byte array
        sseqs = [list(seq) for seq in self.seqs]
        arrayed = np.concatenate(
            [[seq] * rep for (seq, rep) in zip(sseqs, self.reps)]
        ).astype(bytes)
        # ! enforce maxlen limit !
        self.arrayed = arrayed[:, :self.maxlen]
        # get unphased consens sequence from arrayed
        self.consens = base_caller(
            self.arrayed,
            self.data.paramsdict["mindepth_majrule"],
            self.data.paramsdict["mindepth_statistical"],
            self.esth,
            self.este,
        )
        # trim Ns from the left and right ends (gaps count as N)
        self.consens[self.consens == b"-"] = b"N"
        trim = np.where(self.consens != b"N")[0]
        ltrim, rtrim = trim.min(), trim.max()
        self.consens = self.consens[ltrim:rtrim + 1]
        self.arrayed = self.arrayed[:, ltrim:rtrim + 1]
        # update position for trimming
        # NOTE(review): end adds both ltrim and rtrim to the start;
        # end = start + rtrim + 1 looks intended — confirm.
        self.ref_position = (
            self.ref_position[0],
            self.ref_position[1] + ltrim,
            self.ref_position[1] + ltrim + rtrim + 1,
        )

    def get_heteros(self):
        """Record indices of IUPAC ambiguity (hetero) sites in consens."""
        self.hidx = [
            i for (i, j) in enumerate(self.consens) if
            j.decode() in list("RKSYWM")]
        self.nheteros = len(self.hidx)

    def filter_maxhetero(self):
        """Return 1 if hetero count is within maxhet, else 0."""
        if not nfilter2(self.nheteros, self.maxhet):
            self.filters['maxh'] += 1
            return 0
        return 1

    def filter_maxN_minLen(self):
        """Return 1 if N count / length pass the maxn filter, else 0."""
        if not nfilter3(self.consens, self.maxn):
            self.filters['maxn'] += 1
            return 0
        return 1

    def get_alleles(self):
        """Count alleles at hetero sites and phase consens if diploid."""
        # if less than two Hs then there is only one allele
        if len(self.hidx) < 2:
            self.nalleles = 1
        else:
            # array of hetero sites
            harray = self.arrayed[:, self.hidx]
            # remove reads with - or N at variable site
            harray = harray[~np.any(harray == b"-", axis=1)]
            harray = harray[~np.any(harray == b"N", axis=1)]
            # get counts of each allele (e.g., AT:2, CG:2)
            ccx = Counter([tuple(i) for i in harray])
            # remove low freq alleles if more than 2, since they may reflect
            # seq errors at hetero sites, making a third allele, or a new
            # allelic combination that is not real.
            if len(ccx) > 2:
                totdepth = harray.shape[0]
                cutoff = max(1, totdepth // 10)
                alleles = [i for i in ccx if ccx[i] > cutoff]
            else:
                alleles = ccx.keys()
            self.nalleles = len(alleles)
            if self.nalleles == 2:
                try:
                    self.consens = storealleles(self.consens, self.hidx, alleles)
                except (IndexError, KeyError):
                    # the H sites do not form good alleles
                    ip.logger.info("failed at phasing loc, skipping")
                    ip.logger.info("""
                        consens %s
                        hidx %s
                        alleles %s
                        """, self.consens, self.hidx, alleles)

    def store_data(self):
        """Store consens, allele count, depths and ref position at cidx."""
        # current counter
        cidx = self.counters["nconsens"]
        self.nallel[cidx] = self.nalleles
        self.refarr[cidx] = self.ref_position
        # store a reduced array with only CATG depth counts (uint32 here,
        # unlike the first draft's uint16)
        catg = np.array(
            [np.sum(self.arrayed == i, axis=0)
             for i in [b'C', b'A', b'T', b'G']],
            dtype='uint32').T
        self.catarr[cidx, :catg.shape[0], :] = catg
        # store the seqdata and advance counters
        self.storeseq[cidx] = b"".join(list(self.consens))
        self.counters["name"] += 1
        self.counters["nconsens"] += 1
        self.counters["heteros"] += self.nheteros

    # ----------------------------------------------
    def write_chunk(self):
        """Write tmp consens file and the matching HDF5 arrays, then free."""
        # find last filled row: first all-zero refarr row, else full length
        end = np.where(np.all(self.refarr == 0, axis=1))[0]
        if np.any(end):
            end = end.min()
        else:
            end = self.refarr.shape[0]
        # write final consens string chunk
        consenshandle = os.path.join(
            self.data.tmpdir,
            "{}_tmpcons.{}.{}".format(self.sample.name, end, self.tmpnum))
        # write chunk.
        if self.storeseq:
            with open(consenshandle, 'wt') as outfile:
                # denovo just write the consens simple (fasta-like)
                if not self.isref:
                    outfile.write(
                        "\n".join(
                            [">" + self.sample.name + "_" + str(key) + \
                            "\n" + self.storeseq[key].decode()
                            for key in self.storeseq]))
                # reference needs to store if the read is revcomp to reference
                # (SAM-like tab-separated records)
                else:
                    outfile.write(
                        "\n".join(
                            ["{}:{}:{}-{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}"
                            .format(
                                self.sample.name,
                                self.refarr[i][0],
                                self.refarr[i][1],
                                self.refarr[i][2],
                                0,
                                self.revdict[self.refarr[i][0] - 1],
                                self.refarr[i][1],
                                0,
                                # why doesn't this line up with +2 ?
                                # make_indel_cigar(storeseq[i].decode()),
                                #"{}M".format(refarr[i][2] - refarr[i][1]),
                                "{}M".format(len(self.storeseq[i].decode())),
                                "*",
                                0,
                                ## + 2 here
                                self.refarr[i][2] - self.refarr[i][1],
                                #len(self.storeseq[i].decode()),
                                self.storeseq[i].decode(),
                                "*",
                                # "XT:Z:{}".format(
                                #    make_indel_cigar(storeseq[i].decode())
                                #    ),
                            ) for i in self.storeseq.keys()]
                        ))
        # store reduced size arrays with indexes matching to keep indexes
        tmp5 = consenshandle.replace("_tmpcons.", "_tmpcats.")
        with h5py.File(tmp5, 'w') as io5:
            io5.create_dataset(name="cats", data=self.catarr[:end])
            io5.create_dataset(name="alls", data=self.nallel[:end])
            io5.create_dataset(name="chroms", data=self.refarr[:end])
        # free the large arrays
        del self.catarr
        del self.nallel
        del self.refarr
        # return stats
        self.counters['nsites'] = sum([len(i) for i in self.storeseq.values()])
        del self.storeseq
        #return self.counters, self.filters
# Debug session: step through the first 20 clusters and replay the
# build_consens_and_array() logic inline to inspect the ref_position update.
with open(self.chunkfile, 'rb') as inclust:
    pairdealer = izip(*[iter(inclust)] * 2)
    for i in range(20):
        _, chunk = clustdealer(pairdealer, 1)
        if chunk:
            self.parse_cluster(chunk)
            if self.filter_mindepth():
                print(self.reps)
                self.build_consens_and_array()
                # --- inline replay of the same steps, for inspection ---
                # get stacks of base counts
                sseqs = [list(seq) for seq in self.seqs]
                arrayed = np.concatenate(
                    [[seq] * rep for (seq, rep) in zip(sseqs, self.reps)]
                ).astype(bytes)
                self.arrayed = arrayed[:, :self.maxlen]
                # get consens call for each site, applies paralog-x-site filter
                # NOTE(review): passes untruncated `arrayed`, not self.arrayed
                self.consens = base_caller(
                    arrayed,
                    self.data.paramsdict["mindepth_majrule"],
                    self.data.paramsdict["mindepth_statistical"],
                    self.esth,
                    self.este,
                )
                self.consens[self.consens == b"-"] = b"N"
                trim = np.where(self.consens != b"N")[0]
                ltrim, rtrim = trim.min(), trim.max()
                self.consens = self.consens[ltrim:rtrim + 1]
                self.arrayed = self.arrayed[:, ltrim:rtrim + 1]
                print(self.ref_position)
                # update position for trimming
                self.ref_position = (
                    self.ref_position[0],
                    self.ref_position[1] + ltrim,
                    self.ref_position[1] + ltrim + rtrim + 1,
                )
                print(self.ref_position)
[5, 1] (1, 3481, 1) (1, 3481, 3748) [6] (1, 30443, 1) (1, 30443, 30713) [9, 1, 1] (1, 33785, 1) (1, 33785, 34014)
# min where it is not N (scratch exploration of np.where output)
np.argmin(np.where(arr != "N"))
np.where(arr != "N")
(array([ 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]),)
# Scratch: verify the N-trim slice arithmetic on a toy array.
consens = arr
trim = np.where(consens != "N")[0]
ltrim, rtrim = trim.min(), trim.max()
consens = consens[ltrim:rtrim + 1]
print(consens)
print(pos[0] + ltrim, rtrim + 1)
['A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'a'] 7 18
arr[7:17]
array(['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'], dtype='<U1')
test2.run("5", force=True)
Assembly: test2 [####################] 100% 0:00:13 | calculating depths | s5 | [####################] 100% 0:00:25 | chunking clusters | s5 | [####################] 100% 0:04:25 | consens calling | s5 | [####################] 100% 0:01:13 | indexing alleles | s5 |
# Re-import the step-5 internals and redo the depth/chunk steps manually.
from ipyrad.assemble.consens_se import *
self = Step5(test2, True, ipyclient)
self.remote_calculate_depths()
self.remote_make_chunks()
[####################] 100% 0:00:17 | calculating depths | s5 | [####################] 100% 0:00:29 | chunking clusters | s5 |
# Confirm Step5's data object and test2 share the same tmpdir.
self.data.tmpdir
test2.tmpdir
'/home/deren/local/src/ipyrad/tests/6-tortas/test2-tmpdir'
# Re-select a chunk file — this time the FIRST chunk (index 0).
data = test2
sample = data.samples["PZ03concat"]
isref = True
for sample in [sample]:
    chunks = glob.glob(os.path.join(
        self.data.tmpdir,
        "{}.chunk.*".format(sample.name)))
    # sort numerically by the trailing chunk index
    chunks.sort(key=lambda x: int(x.split('.')[-1]))
    chunkfile = chunks[0]
chunkfile
'/home/deren/local/src/ipyrad/tests/6-tortas/test2-tmpdir/PZ03concat.chunk.35195.0'
# Procedural (pre-class) version of the consensus-calling loop; mirrors
# Processor.process_chunk but limited to the first 20 stored loci for debug.
# temporarily store the mean estimates to Assembly
este = data.stats.error_est.mean()
esth = data.stats.hetero_est.mean()
# get index number from tmp file name
tmpnum = int(chunkfile.split(".")[-1])
optim = int(chunkfile.split(".")[-2])
# prepare data for reading and use global maxfraglen
clusters = open(chunkfile, 'rb')
pairdealer = izip(*[iter(clusters)] * 2)
maxlen = data._hackersonly["max_fragment_length"]
# local copies to use to fill the arrays
catarr = np.zeros((optim, maxlen, 4), dtype=np.uint32)
nallel = np.zeros((optim, ), dtype=np.uint8)
refarr = np.zeros((optim, 3), dtype=np.int64)
# if reference-mapped then parse the fai to get index number of chroms
if isref:
    fai = pd.read_csv(
        data.paramsdict["reference_sequence"] + ".fai",
        names=['scaffold', 'size', 'sumsize', 'a', 'b'],
        sep="\t")
    faidict = {j: i for i, j in enumerate(fai.scaffold)}
    revdict = {j: i for i, j in faidict.items()}
# store data for stats counters.
counters = {
    "name": tmpnum,
    "heteros": 0,
    "nsites": 0,
    "nconsens": 0,
}
# store data for what got filtered
filters = {
    "depth": 0,
    "maxh": 0,
    "maxn": 0,
}
# store data for writing
storeseq = {}
## set max limits (paired data sums the per-read limits)
if 'pair' in data.paramsdict["datatype"]:
    maxhet = sum(data.paramsdict["max_Hs_consens"])
    maxn = sum(data.paramsdict["max_Ns_consens"])
else:
    maxhet = data.paramsdict["max_Hs_consens"][0]
    maxn = data.paramsdict["max_Ns_consens"][0]
# load the refmap dictionary if refmapping
done = 0
# debug limit: stop once 20 loci have been stored
while counters["name"] < 20:
    done, chunk = clustdealer(pairdealer, 1)
    if chunk:
        # get names and seqs (alternating lines)
        piece = chunk[0].decode().strip().split("\n")
        names = piece[0::2]
        seqs = piece[1::2]
        # pull replicate read counts from the ";size=N;" name field
        reps = [int(sname.split(";")[-2][5:]) for sname in names]
        ref_position = (-1, 0, 0)
        if isref:
            # parse "chrom:start-end" position from the first read's name
            rname = names[0].rsplit(";", 2)[0]
            chrom, posish = rname.rsplit(":")
            pos0, pos1 = posish.split("-")
            # pull idx from .fai reference dict; 1-based chrom index
            chromint = faidict[chrom] + 1
            ref_position = (int(chromint), int(pos0), int(pos1))
        # apply filters and fill arrays
        if nfilter1(data, reps):
            # get stacks of base counts
            sseqs = [list(seq) for seq in seqs]
            arrayed = np.concatenate(
                [[seq] * rep for seq, rep in zip(sseqs, reps)]
            ).astype(bytes)
            arrayed = arrayed[:, :maxlen]
            # get consens call for each site, applies paralog-x-site filter
            consens = base_caller(
                arrayed,
                data.paramsdict["mindepth_majrule"],
                data.paramsdict["mindepth_statistical"],
                esth,
                este,
            )
            # debug: compare consens length to the reference span
            print(rname)
            print(len(consens), ref_position[2] - ref_position[1])
            # apply a filter to remove low coverage sites/Ns that
            # are likely sequence repeat errors. This is only applied to
            # clusters that already passed the read-depth filter (1)
            # and is not applied to ref aligned data which is expected to
            # have internal spacers that we leave in place here.
            if b"N" in consens:
                if not isref:
                    try:
                        consens, arrayed = removerepeats(consens, arrayed)
                    except ValueError:
                        ip.logger.info(
                            "Caught bad chunk w/ all Ns. Skip it.")
                        continue
                else:
                    pass  #print("here")
            # get hetero sites (IUPAC ambiguity codes)
            hidx = [
                i for (i, j) in enumerate(consens) if j.decode()
                in list("RKSYWM")]
            nheteros = len(hidx)
            # filter for max number of hetero sites
            if nfilter2(nheteros, maxhet):
                # filter for maxN, & minlen
                if nfilter3(consens, maxn):
                    # counter right now
                    current = counters["nconsens"]
                    # get N alleles and get lower case in consens
                    consens, nhaps = nfilter4(consens, hidx, arrayed)
                    # store the number of alleles observed
                    nallel[current] = nhaps
                    # store a reduced array with only CATG
                    catg = np.array(
                        [np.sum(arrayed == i, axis=0)
                         for i in [b'C', b'A', b'T', b'G']],
                        dtype='uint32').T
                    catarr[current, :catg.shape[0], :] = catg
                    refarr[current] = ref_position
                    # store the seqdata for tmpchunk
                    storeseq[current] = b"".join(list(consens))
                    counters["name"] += 1
                    counters["nconsens"] += 1
                    counters["heteros"] += nheteros
                else:
                    filters['maxn'] += 1
            else:
                filters['maxh'] += 1
        else:
            filters['depth'] += 1
# close infile io
clusters.close()
Contig0:3481-3748 267 267 Contig0:30443-30713 270 270 Contig0:33785-34014 229 229 Contig0:37976-38629 653 653 Contig0:40388-40731 343 343 Contig0:41920-42163 243 243 Contig0:44568-44784 216 216 Contig0:55978-56333 355 355 Contig0:62256-62713 457 457 Contig0:64734-65000 266 266 Contig0:67768-68840 960 1072 Contig0:69863-70402 539 539 Contig0:76480-76711 231 231 Contig0:81339-81852 513 513 Contig0:82469-82923 454 454 Contig0:83146-83367 221 221 Contig0:95645-95887 242 242 Contig0:101553-101788 235 235 Contig0:103469-103709 240 240 Contig0:104258-104956 698 698
67768-68840
-1072
# Scratch: memory footprint of a bytes-dtype numpy array (1 byte/char).
ttt = list("NNNNNNNNNNNAAAAAAAAAAAAAAAAAAANNNNNNNNNNNNNN")
ttt = np.array(ttt, dtype=bytes)
ttt
array([b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N'], dtype='|S1')
ttt.nbytes
44
# Scratch: same data via list(bytes) yields ints -> 8 bytes each (352 total).
ttt = np.array(list(b"NNNNNNNNNNNAAAAAAAAAAAAAAAAAAANNNNNNNNNNNNNN"))
ttt.nbytes
352
np.argmin(consens == b"N"), np.argmax(consens == b"N")
(0, 69)
# Procedural version of Processor.write_chunk (run as a cell, hence the
# stray `return` at the end — see NOTE at bottom).
# find last filled row: first all-zero refarr row, else full length
end = np.where(np.all(refarr == 0, axis=1))[0]
if np.any(end):
    end = end.min()
else:
    end = refarr.shape[0]
# write final consens string chunk
consenshandle = os.path.join(
    data.tmpdir,
    "{}_tmpcons.{}.{}".format(sample.name, end, tmpnum))
# write chunk.
if storeseq:
    with open(consenshandle, 'wt') as outfile:
        # denovo just write the consens simple (fasta-like)
        if not isref:
            outfile.write(
                "\n".join([">" + sample.name + "_" + str(key) + \
                "\n" + storeseq[key].decode() for key in storeseq]))
        # reference needs to store if the read is revcomp to reference
        else:
            outfile.write(
                "\n".join(
                    ["{}:{}:{}-{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}"
                    .format(
                        sample.name,
                        refarr[i][0],
                        refarr[i][1],
                        refarr[i][2],
                        0,
                        revdict[refarr[i][0] - 1],
                        refarr[i][1],
                        0,
                        # why doesn't this line up with +2 ?
                        # make_indel_cigar(storeseq[i].decode()),
                        #"{}M".format(refarr[i][2] - refarr[i][1]),
                        "{}M".format(len(storeseq[i].decode())),
                        "*",
                        0,
                        ## + 2 here
                        refarr[i][2] - refarr[i][1],
                        storeseq[i].decode(),
                        "*",
                        # "XT:Z:{}".format(
                        #    make_indel_cigar(storeseq[i].decode())
                        #    ),
                    ) for i in storeseq.keys()]
                ))
# store reduced size arrays with indexes matching to keep indexes
tmp5 = consenshandle.replace("_tmpcons.", "_tmpcats.")
with h5py.File(tmp5, 'w') as io5:
    io5.create_dataset(name="cats", data=catarr[:end])
    io5.create_dataset(name="alls", data=nallel[:end])
    io5.create_dataset(name="chroms", data=refarr[:end])
# free the large arrays
del catarr
del nallel
del refarr
# return stats
counters['nsites'] = sum([len(i) for i in storeseq.values()])
del storeseq
# NOTE(review): `return` outside a function — a leftover from pasting the
# function body into a cell; it raises SyntaxError as a standalone script.
return counters, filters
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-17-c185a885e7fd> in <module>() 110 # write final consens string chunk 111 consenshandle = os.path.join( --> 112 data.tmpdir, 113 "{}_tmpcons.{}.{}".format(sample.name, end, tmpnum)) 114 AttributeError: 'Assembly' object has no attribute 'tmpdir'
data = ip.load_json("6-tortas/tortas.json")
loading Assembly: tortas from saved path: ~/local/src/ipyrad/tests/6-tortas/tortas.json
# Build the tortas assembly from scratch: paired ddRAD, reference-mapped.
data = ip.Assembly("tortas")
data.set_params("project_dir", "6-tortas")
data.set_params("sorted_fastq_path", "/home/deren/Documents/Tortoise/fastq-concats-GG-d10-118/*.gz")
data.set_params("restriction_overhang", ("CATG", "AATT"))
data.set_params("datatype", "pairddrad")
data.set_params("assembly_method", "reference")
data.set_params("reference_sequence", "/home/deren/Documents/Tortoise/lgeorge.genome.fa")
data.get_params()
New Assembly: tortas 0 assembly_name tortas 1 project_dir ./6-tortas 2 raw_fastq_path 3 barcodes_path 4 sorted_fastq_path /home/deren/Documents/Tortoise/fastq-concats-GG-d10-118/*.gz 5 assembly_method reference 6 reference_sequence /home/deren/Documents/Tortoise/lgeorge.genome.fa 7 datatype pairddrad 8 restriction_overhang ('CATG', 'AATT') 9 max_low_qual_bases 5 10 phred_Qscore_offset 33 11 mindepth_statistical 6 12 mindepth_majrule 6 13 maxdepth 10000 14 clust_threshold 0.85 15 max_barcode_mismatch 0 16 filter_adapters 0 17 filter_min_trim_len 35 18 max_alleles_consens 2 19 max_Ns_consens (5, 5) 20 max_Hs_consens (8, 8) 21 min_samples_locus 4 22 max_SNPs_locus (20, 20) 23 max_Indels_locus (8, 8) 24 max_shared_Hs_locus 0.5 25 trim_reads (0, 0, 0, 0) 26 trim_loci (0, 0, 0, 0) 27 output_formats ['p', 's', 'v'] 28 pop_assign_file
# Use 4 threads per engine and run step 2 (read filtering/trimming).
data.ipcluster['threads'] = 4
data.run("2")
Assembly: tortas [####################] 100% 1:22:46 | processing reads | s2 |
data.stats.head()
state | reads_raw | reads_passed_filter | |
---|---|---|---|
AGO02concat | 2 | 11050294 | 11010349 |
AGO08concat | 2 | 13408401 | 13343190 |
AGO09concat | 2 | 15650127 | 15593137 |
AGO11concat | 2 | 12848936 | 12766537 |
AGO12concat | 2 | 12590531 | 12563936 |
# Run step 3 (clustering/mapping) via its internal Step3 class directly.
from ipyrad.assemble.clustmap import *
self = Step3(data, 8, True, ipyclient)
self.run()
[####################] 100% 0:00:00 | indexing reference | s3 | [####################] 100% 0:00:05 | concatenating | s3 | [####################] 100% 1:31:45 | join unmerged pairs | s3 | [####################] 100% 1:34:21 | dereplicating | s3 | [####################] 100% 0:45:59 | splitting dereps | s3 | [####################] 100% 6:06:27 | mapping reads | s3 | [####################] 100% 4:20:31 | building clusters | s3 | [####################] 100% 0:01:17 | calc cluster stats | s3 |
# Branch a 4-sample subset and run steps 4-5 on it.
test = self.data.branch('test', [i.name for i in self.samples[:4]])
test.run('45')
Assembly: test [####################] 100% 0:10:21 | inferring [H, E] | s4 | [####################] 100% 0:00:16 | calculating depths | s5 | [####################] 100% 0:00:29 | chunking clusters | s5 | [####################] 100% 0:14:46 | consens calling | s5 | [####################] 100% 0:01:30 | indexing alleles | s5 |
test = ip.load_json("6-tortas/test.json")
loading Assembly: test from saved path: ~/local/src/ipyrad/tests/6-tortas/test.json
test.run("5", force=True)
Assembly: test [####################] 100% 0:00:15 | calculating depths | s5 | [####################] 100% 0:00:27 | chunking clusters | s5 | [####################] 100% 0:14:45 | consens calling | s5 | [####################] 100% 0:01:30 | indexing alleles | s5 |
# Begin step 6 (across-sample clustering) via its internal class.
data = test
samples = list(data.samples.values())
from ipyrad.assemble.cluster_across import *
self = Step6(data, True, ipyclient)
self.remote_concat_bams()
[####################] 100% 0:00:54 | concatenating bams | s6 |
self.remote_build_ref_regions()
[####################] 100% 0:00:13 | building clusters | s6 |
# prepare i/o (session scratch: exercises the open/header steps of the
# step-6 cluster builder by hand; `self` is the Step6 object created above)
# open the merged, coordinate-sorted bam holding all samples' reads
bamfile = AlignmentFile(
    os.path.join(
        self.data.dirs.across,
        "{}.cat.sorted.bam".format(self.data.name)),
    'rb')
# gzip-compressed output that will hold the catenated consensus clusters
outfile = gzip.open(
    os.path.join(
        self.data.dirs.across,
        "{}.catcons.gz".format(self.data.name)),
    'wb')
# write header with sample names (space-separated, one line, as bytes)
outfile.write(
    b" ".join([i.name.encode() for i in self.samples]) + b"\n")
49
def remote_build_ref_clusters(self):
    """Build reference-mapped clusters and write them to the catcons file.

    For every region in ``self.regions``, fetches the mapped reads from the
    merged sorted bam, stacks them in an array alongside the reference
    sequence, calls variants with ``refvars``, and writes one consensus
    locus per region to ``{name}.catcons.gz`` separated by ``//`` pairs.

    Side effects: writes/overwrites the ``.catcons.gz`` file in
    ``self.data.dirs.across``. Returns None.
    """
    # prepare i/o: merged, coordinate-sorted bam of all samples' reads
    bamfile = AlignmentFile(
        os.path.join(
            self.data.dirs.across,
            "{}.cat.sorted.bam".format(self.data.name)),
        'rb')
    outfile = gzip.open(
        os.path.join(
            self.data.dirs.across,
            "{}.catcons.gz".format(self.data.name)),
        'wb')

    # write header line with the sorted sample names
    snames = sorted([i.name for i in self.samples])
    nsamples = len(snames)
    outfile.write(
        b" ".join([i.encode() for i in snames]) + b"\n")

    # iterate over regions, building one cluster per region
    iregions = iter(self.regions)
    clusts = []
    while 1:
        # pull in the next cluster; StopIteration ends the loop
        try:
            region = next(iregions)
            reads = bamfile.fetch(*region)
        except StopIteration:
            break

        # pull in the reference for this region (+1 converts the 0-based
        # region coords to the 1-based coords get_ref_region expects).
        # BUG FIX: was `data.paramsdict[...]`, which silently relied on a
        # same-named global from the session instead of self.data.
        refn, refs = get_ref_region(
            self.data.paramsdict["reference_sequence"],
            region[0], region[1] + 1, region[2] + 1,
        )

        # collect read sequences keyed by query name, sorted by sample
        rdict = {}
        for read in reads:
            rdict[read.qname] = read.seq
        keys = sorted(rdict.keys(), key=lambda x: x.rsplit(":", 2)[0])

        # array layout: row 0 is the reference, rows 1..nsamples hold reads
        arr = np.zeros((nsamples + 1, len(refs)), dtype=bytes)
        arr[0] = list(refs)
        for key in keys:
            # sample name and mapped start-stop are encoded in the read name
            sname = key.rsplit("_", 1)[0]
            rstart, rstop = key.rsplit(":", 2)[-1].split("-")
            sidx = snames.index(sname)
            # start position relative to the region's reference start
            start = int(rstart) - int(region[1]) - 1
            stop = start + int(rstop) - int(rstart)
            try:
                arr[sidx + 1, int(start): int(stop)] = list(rdict[key])
            except ValueError:
                # read does not fit the region span (length mismatch);
                # skip it rather than abort the whole region.
                # TODO(review): consider trimming to the region instead.
                continue

        # call consensus sequence and variant sites for this region
        clust = []
        avars = refvars(arr.view(np.uint8), PSEUDO_REF)
        dat = b"".join(avars.view("S1")[:, 0]).decode()
        snpstring = "".join(["*" if i else " " for i in avars[:, 1]])
        clust.append("ref_{}:{}-{}\n{}".format(*region, dat))
        # BUG FIX: clusters were built but never collected, so the output
        # file previously contained only the header and an empty separator.
        clusts.append("\n".join(clust))

    outfile.write(str.encode("\n//\n//\n".join(clusts) + "\n//\n//\n"))
    outfile.close()
# debugging pass: restrict the iterator to the first 50 regions only
iregions = iter(self.regions[:50])

def build_ref_clusters2(self):
    "build clusters and find variants/indels to store"
    # NOTE(review): debugging variant of remote_build_ref_clusters. It reads
    # the session globals `iregions` (defined above) and `data` instead of
    # self.regions / self.data -- confirm this is intentional scratch code.
    # prepare i/o: merged sorted bam of all samples' reads
    bamfile = AlignmentFile(
        os.path.join(
            self.data.dirs.across,
            "{}.cat.sorted.bam".format(self.data.name)),
        'rb')
    outfile = gzip.open(
        os.path.join(
            self.data.dirs.across,
            "{}.catcons.gz".format(self.data.name)),
        'wb')
    # write header with sample names
    snames = sorted([i.name for i in self.samples])
    nsamples = len(snames)
    outfile.write(
        b" ".join([i.encode() for i in snames]) + b"\n")
    # get clusters
    clusts = []
    while 1:
        # pull in the cluster (global iterator; exhaustion ends the loop)
        try:
            region = next(iregions)
            reads = bamfile.fetch(*region)
        except StopIteration:
            break
        # pull in the reference for this region (1-based coords for the
        # lookup; uses the session-global `data`, see note above)
        refn, refs = get_ref_region(
            data.paramsdict["reference_sequence"],
            region[0], region[1] + 1, region[2] + 1,
        )
        # build cluster dict for sorting
        rdict = {}
        for read in reads:
            rdict[read.qname] = read.seq
        keys = sorted(rdict.keys(), key=lambda x: x.rsplit(":", 2)[0])
        # make empty array
        # NOTE(review): only len(rdict) rows, yet row 0 is filled with the
        # reference below, so the first read (idx 0) overwrites it; the
        # sibling function allocates nsamples + 1 rows and writes reads at
        # sidx + 1 -- confirm which layout is intended here.
        arr = np.zeros((len(rdict), len(refs)), dtype=bytes)
        # fill it
        arr[0] = list(refs)
        for idx, key in enumerate(keys):
            # get start and stop from name
            sname = key.rsplit("_", 1)[0]
            rstart, rstop = key.rsplit(":", 2)[-1].split("-")
            # get start relative to ref
            start = int(rstart) - int(region[1]) - 1
            stop = start + int(rstop) - int(rstart)
            try:
                sequence = list(rdict[key])
                arr[idx, int(start): int(stop)] = sequence
            except ValueError:
                # read does not fit the region span; dump it for inspection
                print(key)
                print(sname, len(sequence), start, stop, arr.shape[1])
                print(rdict[key])
    # NOTE(review): returns the array of the last region processed; the
    # original flattened paste makes the return's indentation ambiguous.
    return arr
build_ref_clusters2(self)
PZ03concat_10:1:67768-68840 PZ03concat 960 0 1072 1301 CATGAGATGCTAACGAGGTCTCTCAAAGCTCCAGGCCCAGCTGACTGACAGCTTTGCATACTGTTGTTCTTNNNNNNNNNNNNNNNNNNNNTTAGCACCTTTCGCTCAAGGGCTTCTAATGCTCCCCTCNCCTCCCCTCAATAATGATACTTAGCACTTGATGAATTNNNNNNNNNNNNNNNNNNNNAATTAAATAAGCCTTACCTGCTGTGAAATGTCAATACTATTTTGAAGATGGGGAAACCGAGGCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGTTCTCAGGGGTCCAGCAACCTTCAAGCCCTTGTTCTCAAGATCCCCTTTGGAATAAGACCTCGTCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGAGTCCAGTCCCCTGCCCTCATGGTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGATGCAACTTCTGTGTGTAGAGTAGGCTTCCGTGTTTTCAGATGCACATATTTGGGTCTCTGACCTGGTCCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
array([[b'C', b'A', b'T', b'G', b'C', b'T', b'T', b'C', b'A', b'C', b'C', b'A', b'G', b'C', b'C', b'C', b'A', b'T', b'A', b'G', b'C', b'G', b'G', b'T', b'G', b'G', b'G', b'C', b'T', b'C', b'T', b'G', b'T', b'G', b'C', b'C', b'C', b'C', b'C', b'C', b'A', b'C', b'T', b'C', b'T', b'G', b'C', b'C', b'C', b'A', b'C', b'A', b'G', b'G', b'T', b'A', b'G', b'G', b'C', b'T', b'G', b'G', b'G', b'C', b'T', b'G', b'G', b'C', b'T', b'G', b'T', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'C', b'C', b'C', b'G', b'G', b'C', b'A', b'G', b'G', b'C', b'T', b'T', b'G', b'G', b'A', b'C', b'A', b'A', b'A', b'G', b'A', b'C', b'G', b'T', b'T', b'A', b'A', b'G', b'T', b'T', b'T', b'T', b'C', b'A', b'A', b'G', b'C', b'C', b'C', b'C', b'A', b'A', b'A', b'T', b'G', b'T', b'T', b'T', b'T', b'T', b'G', b'G', b'A', b'G', b'G', b'A', b'G', b'A', b'T', b'G', b'G', b'T', b'T', b'C', b'C', b'C', b'C', b'C', b'C', b'T', b'T', b'A', b'A', b'A', b'T', b'T'], [b'C', b'A', b'T', b'G', b'C', b'T', b'T', b'C', b'A', b'C', b'C', b'A', b'G', b'C', b'C', b'C', b'A', b'T', b'A', b'G', b'C', b'G', b'G', b'T', b'G', b'G', b'G', b'C', b'T', b'C', b'T', b'G', b'T', b'G', b'C', b'C', b'C', b'C', b'C', b'C', b'A', b'C', b'T', b'C', b'T', b'G', b'C', b'C', b'C', b'A', b'C', b'A', b'G', b'G', b'T', b'A', b'G', b'G', b'C', b'T', b'G', b'G', b'G', b'C', b'T', b'G', b'G', b'C', b'T', b'G', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', 
b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'C', b'C', b'C', b'G', b'G', b'C', b'A', b'G', b'G', b'C', b'T', b'T', b'G', b'G', b'A', b'C', b'A', b'A', b'A', b'G', b'A', b'C', b'G', b'T', b'T', b'A', b'A', b'G', b'T', b'T', b'T', b'T', b'C', b'A', b'A', b'G', b'C', b'C', b'C', b'C', b'A', b'A', b'A', b'T', b'G', b'T', b'T', b'T', b'T', b'T', b'G', b'G', b'A', b'G', b'G', b'A', b'G', b'A', b'T', b'G', b'G', b'T', b'T', b'C', b'C', b'C', b'C', b'C', b'C', b'T', b'T', b'A', b'A', b'A', b'T', b'T'], [b'C', b'A', b'T', b'G', b'C', b'T', b'T', b'C', b'A', b'C', b'C', b'A', b'G', b'C', b'C', b'C', b'A', b'T', b'A', b'G', b'C', b'G', b'G', b'T', b'G', b'G', b'G', b'C', b'T', b'C', b'T', b'G', b'T', b'G', b'C', b'C', b'C', b'C', b'C', b'C', b'A', b'C', b'T', b'C', b'T', b'G', b'C', b'C', b'C', b'A', b'C', b'A', b'G', b'G', b'T', b'A', b'G', b'G', b'C', b'T', b'G', b'G', b'G', b'C', b'T', b'G', b'G', b'C', b'T', b'G', b'T', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'C', b'C', b'C', b'G', b'G', b'C', b'A', b'G', b'G', b'C', b'T', b'T', b'G', b'G', b'A', b'C', b'A', b'A', b'A', b'G', b'A', b'C', b'G', b'T', b'T', b'A', b'A', b'G', b'T', b'T', b'T', b'T', b'C', b'A', b'A', b'G', b'C', b'C', b'C', b'C', b'A', b'A', b'A', b'T', b'G', b'T', b'T', b'T', b'T', b'T', b'G', b'G', b'A', b'G', b'G', b'A', b'G', b'A', b'T', b'G', b'G', b'T', b'T', b'C', b'C', b'C', b'C', b'C', b'C', b'T', b'T', b'A', b'A', b'A', b'T', b'T'], 
[b'C', b'A', b'T', b'G', b'C', b'T', b'T', b'C', b'A', b'C', b'C', b'A', b'G', b'C', b'C', b'C', b'A', b'T', b'A', b'G', b'C', b'G', b'G', b'T', b'G', b'G', b'G', b'C', b'T', b'C', b'T', b'G', b'T', b'G', b'C', b'C', b'C', b'C', b'C', b'C', b'A', b'C', b'T', b'C', b'T', b'G', b'C', b'C', b'C', b'A', b'C', b'A', b'G', b'G', b'T', b'A', b'G', b'G', b'C', b'T', b'G', b'G', b'G', b'C', b'T', b'G', b'G', b'C', b'T', b'G', b'T', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'C', b'C', b'C', b'G', b'G', b'C', b'A', b'G', b'G', b'C', b'T', b'T', b'G', b'G', b'A', b'C', b'A', b'A', b'A', b'G', b'A', b'C', b'G', b'T', b'T', b'A', b'A', b'G', b'T', b'T', b'T', b'T', b'C', b'A', b'A', b'G', b'C', b'C', b'C', b'C', b'A', b'A', b'A', b'T', b'G', b'T', b'T', b'T', b'T', b'T', b'G', b'G', b'A', b'G', b'G', b'A', b'G', b'A', b'T', b'G', b'G', b'T', b'T', b'C', b'C', b'C', b'C', b'C', b'C', b'T', b'T', b'A', b'A', b'A', b'T', b'T']], dtype='|S1')
# session scratch: re-runs the tail of remote_build_ref_clusters by hand,
# using the variables (arr, region, clusts, outfile) left in the session.
# get consens seq and variant site index
print("")
clust = []
avars = refvars(arr.view(np.uint8), PSEUDO_REF)
dat = b"".join(avars.view("S1")[:, 0]).decode()
snpstring = "".join(["*" if i else " " for i in avars[:, 1]])
clust.append("ref_{}:{}-{}\n{}".format(*region, dat))
# or, just build variant string (or don't...)
# write all loci with [locids, nsnps, npis, nindels, ?]
# for key in keys:
#     clust.append("{}\n{}".format(key, rdict[key]))
# clust.append("SNPs\n" + snpstring)
# clusts.append("\n".join(clust))
outfile.write(str.encode("\n//\n//\n".join(clusts) + "\n//\n//\n"))
outfile.close()
remote_build_ref_clusters(self)
1 0 234 234 2 0 234 234 3 0 234 234 4 0 234 234 1 0 919 919 2 0 267 919 3 0 919 919 4 0 261 919 1 201 462 462 2 201 462 462 3 201 462 462 4 0 462 462 1 0 237 237 2 0 237 237 3 0 237 237 4 0 237 237 1 0 238 238 2 0 238 238 3 0 238 238 4 0 238 238 1 0 590 960 2 0 590 960 3 0 1054 960 4 0 636 960 1 0 257 257 2 0 248 248 1 0 887 887 2 78 289 887 1 0 434 623 2 222 623 623 1 0 510 674 2 271 510 674 3 271 510 674 4 271 674 674 3 0 328 328 1 244 494 649 2 0 494 649 3 0 649 649 4 0 494 649 1 0 545 545 2 0 545 545 3 0 545 545 4 0 545 545 1 254 501 1012 2 254 1012 1012 4 0 501 1012 2 0 202 202 1 0 827 827 1 0 243 243 2 0 243 243 3 0 243 243 4 0 243 243 1 0 378 378 2 163 378 378 1 0 223 223 2 0 223 223 3 0 223 223 4 0 223 223 1 0 562 562 2 0 228 562 3 0 228 562 4 0 228 562 1 0 254 254 2 0 254 254 3 0 254 254 4 0 254 254 4 0 259 259 1 0 468 468 2 58 273 468 3 0 273 468 1 0 775 775 2 343 578 775 3 0 578 775 4 343 578 775 1 0 211 211 3 0 211 211 1 0 228 228 2 0 228 228 3 0 228 228 4 0 228 228 4 0 854 854 1 0 238 238 2 0 238 238 3 0 238 238 4 0 238 238 1 274 765 765 2 0 765 765 3 0 765 765 4 274 765 765 1 0 219 219 2 1 219 219 3 1 219 219 4 1 219 219 1 180 422 422 2 0 422 422 3 180 422 422 4 180 422 422 1 0 231 231 2 0 231 231 3 0 231 231 4 0 231 231 1 44 288 695 2 44 288 695 3 0 695 695 4 44 695 695 1 0 251 251 2 0 251 251 3 0 251 251 4 0 251 251 1 0 242 552 2 0 242 552 3 0 552 552 4 0 242 552 1 0 258 258 2 0 258 258 3 0 258 258 4 0 258 258 1 0 583 868 2 376 868 868 3 376 583 868 4 0 874 874 2 0 459 539 3 0 459 539 4 0 539 539 1 118 383 383 2 118 383 383 4 0 383 383 1 274 737 737 2 0 737 737 3 274 737 737 4 0 737 737 1 0 249 249 2 0 249 249 3 4 249 249 4 0 249 249 1 392 858 858 3 0 801 858 1 2 219 219 2 0 219 219 2 502 992 992 3 0 760 992 4 502 760 992 2 0 207 207 3 0 448 448 1 0 481 481 2 0 215 481 3 0 215 481 1 0 536 536 2 276 536 536 3 0 536 536 4 0 536 536 1 0 631 769 2 0 631 769 3 0 631 769 4 0 769 769 2 0 569 842 3 358 842 842 4 0 842 842 1 139 620 620 2 356 620 620 3 0 620 
620 4 0 620 620 1 216 471 471 2 0 471 471 3 0 471 471 4 216 471 471 3 0 215 215 1 145 367 598 2 145 598 598 3 0 367 598 4 145 598 598 1 0 223 223 2 0 223 223 3 0 223 223 4 0 223 223 3 0 210 210 4 0 476 476 1 272 505 690 2 272 690 690 3 157 505 690 4 0 505 690 1 247 532 532 2 0 460 532 3 247 461 532 1 0 217 389 2 0 389 389 3 0 389 389 4 0 1118 960 1 0 514 514 2 268 514 514 3 268 514 514 4 268 514 514 4 0 270 270 1 142 398 748 2 142 398 748 3 0 748 748 4 142 398 748 3 0 546 546 2 0 258 258 4 0 258 258 1 0 225 225 2 0 225 225 3 0 225 225 4 0 225 225 1 1047 1689 1929 2 338 1262 1929 2 1469 1929 1929 3 1046 1689 1929 4 0 820 1929 4 1047 1689 1929 1 136 355 607 2 136 355 607 3 136 607 607 4 0 355 607 1 0 509 509 2 0 415 509 3 0 509 509 4 0 247 509 1 0 355 355 1 0 490 491 2 258 491 491 3 0 490 491 4 258 490 491 4 0 835 835 3 0 227 227 1 0 214 214 2 0 214 214 3 0 214 214 1 431 1061 1195 2 235 1235 1195 3 431 1061 1195 4 0 1235 1195 CATGGAGTTGATGTGCCAAGGACAGATCAGAAACCAAATAGAGGGATAAAGAGAGTCCCTTTACAACTTGTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGGACAAGGCTCTTCACTTTAACGCTCAGGCTTTGTTTACACTGGGAGATGGTGTTCCAACACTGGCAGAAAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGGGGGCTTGACTCCTGTTTGTGCCTCTGAAAATCTCTCCTTTGGTGCTCGATGCTGTGCTTCTTACCACTGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTTGTCAGCAGCAGAAGTGAATCTTAGCAGCAGGCATAAACATATTTGCCTCTTCCGTTCCCAAGAAGCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTACATATCAATGGCTTTTCTAATGGCCAGTGCTGCAAACTTCACTTGGGATCTGAAAATCAGCTGGGTTCTCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 4 0 762 762 1 0 510 510 2 0 383 510 3 0 383 510 4 140 383 510 1 450 677 677 2 450 677 677 3 0 677 
677 4 450 677 677 1 185 518 518 2 285 518 518 3 285 518 518 4 0 518 518 1 0 510 644 2 0 644 644 3 0 510 644 4 0 510 644 2 0 590 590 4 0 590 590 1 0 222 222 2 0 222 222 3 0 222 222 4 0 222 222 1 0 507 507 2 0 507 507 3 0 507 507 4 0 507 507 1 0 551 551 1 0 442 442 4 191 442 442 1 296 553 553 2 296 553 553 3 296 553 553 4 0 553 553 1 0 438 438 2 0 258 438 3 0 258 438 4 0 438 438 1 0 352 767 2 109 767 767 3 109 352 767 4 109 352 767 1 0 240 241 2 0 241 241 3 0 240 241 4 0 240 241 3 0 416 416 1 0 516 516 2 0 227 516 3 0 227 516 4 0 227 516 2 0 327 327 1 0 892 892 2 278 892 892 3 186 723 892 4 278 892 892 1 0 470 470 2 0 470 470 3 0 470 470 4 0 470 470 1 0 1205 1383 CATGAGAGGTTATGCAGATGGAATGTGAAAATGATCAATAAAGAGAGTTGAAGTATTCCAGGCACCTGCTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGAGTTTCCTAGCAAAGGGCTGCTCATTTCATCCTGTACCTAGGCTGTCTCACTGCCTCCTGCTAGGGATTAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGTAGACTTTGTCTCCAAAGCAGCTTGTTTTGTTCATTCAATGAAAAGGTGCCACGTTCTCTTCCCTAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGATCTGCAAAGGGAAAAAAAAAAAGGGGGAGCTATGGGGGGTTAGCATCCTTGCGGGAGATAGATAAGTACTTAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTCTGTCAGTCTAGCTCCTGCTACTCAAGGAGGTGGATTACCTACCTCAATAGGAGAACCTCCTCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 2 99 643 1383 2 975 1205 1383 3 424 1205 1383 4 99 643 1383 4 975 1383 1383 1 0 255 410 2 0 255 410 3 0 255 410 4 0 410 410 1 0 249 1239 1 640 1239 1239 2 0 249 1239 2 640 1239 1239 3 0 1239 1239 
CATGTCATTTTCTTTGAGTTTTTCCCCTTTCACCTGGAAACTATCAAACCCTTGAAACTCTGGGTTGTGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATTGTTCTGACCATCTTCCTGCGGCTGAGGCATCTCAAGAAACTGAGAAACGTGAGGTGCGTTAGGGACCAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGGTCATTGCCCATTCTATGCGATATATCCGGGGATCGAAAAAGCACCCACAAGGTGCCATCTGGAAACCTGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCAGCCAAGACCCGCCAAAGGCCAGATCCTCAGCTGGTGTAAATTGGTGGTGTAGTTCCGGTGACTTCAATGGGGCTGAGCCACTGTACACCAGCTAAGGGTCCAGCACAAAGGATCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAGCGTCACTTGCACCGAACTGGGATTTGGGGCAAATGCAGTCTGGAGTCAGAGGGTGCAGGTTCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 4 0 1239 1239 CATGTCATTTTCTTTGAGTTTTTCCCCTTTCACCTGGAAACTATCAAACCCTTGAAACTCTGGGTTGTGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATTGTTCTGACCATCTTCCTGCGGCTGAGGCATCTCAAGAAACTGAGAAACGTGAGGTGCGTTAGGGACCAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGATGTATTGGGGGTTGCTTGGGACTGGGTGGTAAGGTGGGTAGGGGACTGTTTGGACAGGACTCTTGANNNNNNCGTGGGTGTCCAGAAGGTAAGCGCTTGGTGAACTTGATCCACCCGTGAGTTTATAAACTGACGGTTGCACGAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGGTGGTGTAGTTCCGGTGACTTCAATGGGGCTGAGCCACTGTACACCAGCTAAGGGTCCAGCACAAAGGATCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAGCGTCACTTGCACCGAACTGGGATTTGGGGCAAATGCAGTCTGGAGTCAGAGGGTGCAGGTTCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 1 0 254 254 2 0 254 254 3 0 254 254 4 0 254 254 2 0 511 511 4 0 
511 511 1 584 1015 1015 2 584 1015 1015 3 0 1016 1015 CATGTCTGAGTCTGTATCTCCTGAAGATGGAGCTATCCAGTTCCCTGCTACATATGTGCCAGTTCAACTGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCCATTAACATACATAGAGAGACCTAGGCATCCCCTAGTATTATTTCAGGTAGAAGCAACATACAACTGTTGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGTCCTAAGGGAGGCGAAAGGCCAAGTTAAAGGATGCCCAACTAGGAGAGGAGCAATTNNNNNNNNNNNNNNNNCATGGGAGGGGAGCGGCTGCAATGATGTCTGGAAAGAAGGGGAACAAAAGGTCTTGTGGTTTGGATGCAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGATCTCCAGATCTGTGTGAACTTCAGTTTTAGGAGCTGTGAACCTCTTGAGCGATACTTGTAGGAGATTTCAAATTACCCCTCTGCTGTGTAATAGGGATGGAAGAAGCCTCCAGACCAAAGTAGTTTAGCACAAGCGGATGCCCCTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNANAAAACNCTGCCCTG 4 584 1015 1015 1 0 234 239 2 0 234 239 3 0 239 239 4 0 239 239 1 272 541 541 2 0 541 541 3 316 541 541 4 0 541 541 1 0 217 217 2 0 217 217 3 0 217 217 1 361 582 788 2 361 582 788 3 0 788 788 4 361 582 788 1 0 558 558 1 468 722 722 2 468 722 722 3 0 722 722 4 468 722 722 2 0 214 215 3 0 214 215 4 0 215 215 1 0 230 230 2 0 230 230 3 0 230 230 4 0 230 230 1 0 427 427 2 0 427 427 3 0 427 427 4 0 427 427 1 0 847 848 2 134 847 848 3 0 363 848 3 625 848 848 4 134 363 848 4 625 847 848 1 0 247 247 2 0 247 247 3 0 247 247 4 0 247 247 1 0 217 736 1 496 736 736 2 0 217 736 2 496 736 736 3 0 736 736 4 0 217 736 4 496 736 736 1 0 305 451 2 0 451 451 4 0 451 451 1 0 244 244 2 0 244 244 3 0 244 244 4 0 244 244 1 0 265 265 4 0 265 265 1 116 390 390 2 0 341 390 3 116 341 390 4 116 341 390 1 0 237 237 2 0 237 237 3 0 237 237 4 0 237 237 1 231 487 487 2 231 487 487 3 231 487 487 4 0 487 487 1 0 223 223 2 0 223 223 3 0 223 223 4 0 223 223 1 0 226 226 2 0 226 226 4 0 226 226 1 305 528 528 
2 305 528 528 3 0 528 528 4 305 528 528 4 0 263 263 1 0 583 583 2 94 583 583 3 94 307 583 1 612 1142 1142 2 337 1142 1142 3 0 1142 1142 CATGGGAAGCTTTGATCCCTTTCAGGAATCCTTAAGGGTGTGACATTTGAAGCTCTTCCAGAGACAGGATGNNNNNNNNNNNNNNNNNTGGTAAAGCCGTAAATCTTGGAAGCTCCGTGTTTGAGCCAAATCTACCCTAGGATGAGATGTAGCCTGCAGCAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGCCTCCGGAGGTTGTGGAATCTCCATCTCTGGAGATATTTAAGAGTAGTTTAGATAACTGTCTATCTGGGATNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATCTATGAATCCTTTTATGTCTCCACTATCTGTGGTGGAGCAATTTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGAAGCTGGTTAGTTTGTTTTATTCCATTTCCTTCACTGCTGGGTTCGCCTCCTTCATCTCAGCAGCCTGGAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGTTGATTTCCTTCACTTCTCAGCATTTCCCCTCTTGTTAGGACCTGATTCTCTGCAATCCTGTGCGTCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGATGCAAGGCAAGGGAGAATCAGGCCAATGGAG 4 612 1142 1142 4 0 256 256 1 276 807 807 2 276 807 807 3 276 807 807 4 0 807 807 2 0 212 212 1 0 484 675 2 0 484 675 3 0 675 675 4 0 484 675 2 0 1255 1072 
AATTCTTTGAGCTGCTGTTGATAAATCAGCCTGGAAGAGGTGGAACGTTGTGAAGATTCCCGGGGTACCCAGCGTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCTATGCAGGTGAGCAAGAATCAGGCGTCCTAGACTCCACGGTGCAAATAAGTGACGGCCACGTTAACAYGACCCTGTACTGAAAACCCACGAACCACTATGCCTACCTTCATGCCTCCAGCTTCCATCCCGGACACACCACACGATCCATCGTCCACAGCCAAGTGCTGAGGTACAACTGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCAGACGTGTACCTAGAAGTCTCCTGCTGCAAGACAAACCCAAGAAAGAAACCAACAGGACTCCGCTGGCCATCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTCACAGGCCTTGGGTGGCAGGCCAGTCCTCGCCCCCAGACAGCCTGCCAACCTGAAGCATATTCTCACCAGTAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGCAACAAACCTTGATGCCAGCTATGCCCACATATCTACACCAGCGACACCATCACAGGATCTAACCAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGTCCACCAATGTAATATATGCCATCATGTGCCAGCAATGCCCCTCTGCTATGTACATCGGCCARACTGGACAGTCTCTACGGAAAAGGATAAATGGACACAAATCAGATATTAGGAATGGCAATCNNNNNNNNNNNNNNNNNNNNNNCT 3 0 496 1072 3 835 1072 1072 4 0 1072 1072 AATTCTTTGAGCTGCTGTTGATAAATCAGCCTGGAAGAGGTGGAACGTTGTGAAGATTCCCGGGGTACCCAGCGTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCAAATAAGTGACGGCCACGTTAACACGACCCTGTACTGAAAACCCACGAACCACTATGCCTACCTTCATGCCTCCAGCTTCCATCCCGGACACACCACACGATCCATCGTCCACAGCCAAGTGCTGAGGTACAACTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCAGACGTGTACCTAGAAGTCTCCTGCTGCAAGACAAACCCAAGAAAGAAACCAACAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAACAAACCTCGATGCCAACTCTGCCCACATATCTACACCAGCGACACCATCACAGGATCTAACCAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGTGCCAGCAATGCCCCTCTGCTATGTACATCGGMCAAACTGGMCAGTCTCTAYGGAAAAGGATAAATGGACACAAATCAGATATTAGGAATGGCAATTNNNNNNNNNNNNNNNGAGAACACT 1 0 227 227 2 0 227 227 3 0 227 
227 4 0 227 227 2 0 220 220 3 0 220 220 4 0 220 220 1 0 230 230 2 0 230 230 3 0 230 230 4 0 230 230 1 0 222 222 2 0 222 222 3 0 222 222 4 0 222 222 3 0 606 606 4 0 265 265 1 0 229 229 2 0 229 229 3 0 229 229 4 0 229 229 1 0 238 238 2 0 238 238 3 0 238 238 4 0 238 238 1 0 704 704 2 203 704 704 3 0 704 704 4 0 704 704 1 0 573 573 2 0 573 573 3 0 573 573 4 0 573 573 1 0 254 254 3 0 254 254 1 0 227 227 2 0 227 227 3 0 227 227 4 0 227 227 1 0 220 220 2 0 220 220 2 0 572 572 2 0 413 413 1 0 651 930 2 0 930 930 3 0 651 930 4 0 651 930 1 0 220 220 2 0 220 220 3 0 220 220 4 0 220 220 1 0 350 350 3 0 350 350 4 0 209 350 4 0 542 542 2 118 1169 1078 3 0 1169 1078 AAATTTGCTTCTCTGTTAACTGGGGAGAATAAACCTTCCTTTCCCCCGCNNTGTGCTCTGCCTTGTCTAGCTAGATTGTCAGCTCTTTGAGACAGGGACTGTCTCTTACAGTGCGTAGCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGCTGCCTTTAACAATCTCACCTCCTTATATGGCTAAACTATTCTTTTTTTGAACTTGGCTTTGTTTACAGTCCACTTAAAATATTTAGACTCTGGCTGGGTTAGACAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTTATTTACATCCTATAAAATAAAGTGTTATTGGCAATATAGCAGCTTCCTAGCTCCATTTTAACTATTGACTGGGGTAACATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGAATTGAGTGGTACACAGGAATGGCGGAGCTCAGTCAGACCGCTGGGCCATCTAGCCCAAGGTTCTGCCTTTGCAAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGATTGAGTGCTCCATCCCATCATCCACTCCCAGCTTCTGGAAGTCAGAGGTCTAGA 1 271 518 518 2 271 518 518 3 271 518 518 4 0 518 518 1 0 256 256 2 0 256 256 3 0 256 256 4 0 256 256 4 0 679 679 1 0 258 258 2 0 258 258 4 0 258 258 1 152 501 501 2 152 390 501 3 152 390 501 4 0 501 501 1 0 717 717 2 0 244 717 2 502 717 717 3 0 717 717 4 0 244 717 1 405 622 809 2 0 622 809 4 405 809 809 1 387 617 617 2 387 617 617 4 0 617 617 1 1 233 233 2 1 233 233 3 0 233 
233 4 1 233 233 1 349 605 605 2 349 605 605 3 349 605 605 4 0 605 605 1 0 498 498 2 0 498 498 3 0 498 498 4 0 498 498 1 577 1180 1180 2 340 1180 1180 3 571 1180 1180 4 0 1180 1180 CATGAAGGGCTGGTGCAACCATTTAGGCGACCGCCTAGGGCACAAAGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGCAGCAAGTAGGGGGGGGGAGGCGGCACGCGGGGGAGCCCCCCCCCCCAGCTCCCCCCTGCCCCGCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTCTCCCACCTCCCAGGCTTGTGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGGGGGGCGCAAGGTGGAAGTTTCACCTAGGGTGCGAAACATCTTTGCACCGGCCCTGCATCCATGACATGAGAAGTTGTGTCTTTTGGTCCCTGGGTGTCTGCCACATCAACACCACCACCCAATAGCTTTGGGGAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTCTGATTGGCAGTGCTTCCTAGTTCCCTGATTGATCCAGATGCTCTATTTAAGCCTGGAGGGGACAGAAGAAAATTTTCTGTACAAGTAGGTGGATCCTCAGCTGGCTGGTTTGACAGCACCTGCTCCTTTGCATTGCTTTGGCTCCTNNNNNNNNNNNNNNNNNNNNNCCCTTAGCTCTGGTCCCAGGCCCCTTACTCCACTTTGCCTACTAAGCTGAAGCACCCGTGCCCTAGCCATGACAGCTGGTAGCTGTGGCAAGATTGAAATGCAATAC 1 11 248 248 2 11 248 248 3 11 248 248 4 0 248 248 1 135 510 510 2 135 510 510 4 0 510 510 1 0 233 233 2 0 233 233 3 0 233 233 4 0 233 233 1 0 462 462 2 0 220 462 3 0 316 462 1 0 363 363 2 0 256 363 3 0 256 363 4 0 256 363 1 110 352 352 2 0 352 352 3 110 352 352 4 110 352 352 1 0 218 218 4 1 216 218 1 0 534 908 2 275 908 908 3 275 534 908 4 275 534 908 1 0 579 676 2 345 579 676 3 204 579 676 4 345 676 676 1 0 486 486 2 0 486 486 3 0 486 486 4 0 486 486 1 0 232 368 2 0 232 368 3 0 368 368 4 0 232 368 1 0 817 817 1 88 309 309 2 0 309 309 3 88 309 309 4 81 309 309 2 83 296 296 3 0 296 296 2 0 256 256 4 0 256 256 1 0 248 248 2 0 248 248 3 0 248 248 4 0 248 248 1 0 779 943 2 0 779 943 3 0 943 943 4 0 943 943 1 0 232 660 2 0 232 660 3 0 660 660 4 0 249 249 1 195 444 444 2 195 444 444 3 0 444 444 4 0 444 444 1 0 821 960 2 0 1059 960 3 0 1148 960 4 0 
1059 960 2 184 711 711 4 0 453 711 1 0 755 755 1 0 249 490 2 0 249 490 3 0 490 490 4 0 249 490 1 0 612 612 3 0 612 612 4 0 612 612 4 0 259 259 1 0 419 419 2 0 246 419 3 0 246 419 4 0 419 419 1 0 232 515 2 0 470 515 3 0 232 515 4 0 515 515 1 0 725 725 2 220 438 725 3 0 438 725 4 104 438 725 1 0 248 248 2 0 248 248 3 0 248 248 4 0 248 248 2 0 465 465 4 0 465 465 3 0 429 429 1 0 844 844 2 0 224 844 2 611 844 844 3 0 224 844 3 611 844 844 4 0 224 844 4 449 844 844 1 201 441 441 2 201 441 441 3 201 441 441 4 0 441 441 1 121 353 353 2 121 353 353 3 121 353 353 4 0 353 353 1 0 253 540 2 0 253 540 3 0 540 540 4 0 253 540 1 644 1720 1604 2 644 1720 1604 3 133 1162 1604 CATGATTGCATCAAAAACCTACCACAAGAGTTCAAAGTTAGGGGAGANNNNNNNNNATACTCCTTCTCCAGGGAGAGCCAGTAAATGGCAGTTTAACTGCCTTATTAACAGCAACCCTTGTATCTGCCAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGTAGTTACCTTGAAGATAGGGTCTAGGATTAGCTGGCAGAAAGTCCTGGGCAGTTTCCTTCCATCAGNNNNNNNNNNNNNNNNNNNNNNNNTGCCAGAAGCAGGATCAAAATACCTAAAAAAAAAATCCAGTCAGAGGGGATTGTGGTTTCGCCATCTTGTTTAATTAGCACTGCTTATTCTGCTTGCCTTTCAAGATGGAGGGAAGGGTTATCAAGTCAAAGACCCGTTTTGAGATGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGTATCGTGTATTCCTGATCCCATCCCCTAATATTTGTCTGATTAAATCTAGATTTAAAGTCCCTCCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTAGGATTCCTTCCTTTGGGTGCTACCATACTGTTGCTTCCTTACTCCCAGCCCAGGAGAGCTCAAGTATTCCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAA 4 0 1720 1604 
CATGTCTCTTCCCAAGGCAATGCCTATCAGTTCTTACCTTCAGTAGGGGTTTGCCCTCCTTGTCTTTATCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGATTGCATCAAAAACCTACCACAAGAGTTCAAAGTTAGGGGAGAAAGCAGCTCATACTCCTTCTCCAGGGAGAGCCAGTAAATGGCAGTTTAACTGCCTTATTAACAGCAACCCTTGTATCTGCCAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGTAGTTACCTTGAAGATAGGGTCTAGGATTAGCTGGCAGAAAGTCCTGGGCAGTTTCCTTCCATCAGNNNNNNNNNNNNNNNNNNNNNNNTTTCCAGAAGCAGGATCAAAATACCTNAAAAAAAAATCCAGTCAGAGGGGATTGTGGTTTCGCCATCTTGTTTAATTAGCACTGCTTATTCTGCTTGCCTTTCAAGATGGAGGGAAGGGTTATCAAGTCAAAGACCCGTTTTGAGATGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGTATCGTGTATTCCTGATCCCATCCCCTAATATTTGTCTGATTAAATCTAGATTTAAAGTCCCTCCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTAGGATTCCTTCCTTTGGGTGCTACCATACTGTTGCTTCCTTACTCCCAGCCCA 1 105 331 331 2 105 329 331 3 0 329 331 4 105 329 331 1 0 252 593 2 0 252 593 3 0 593 593 4 0 252 593 3 0 473 473 2 0 231 231 3 0 231 231 1 0 211 211 2 0 211 211 2 0 831 831 4 568 831 831 2 201 427 427 3 201 427 427 4 0 427 427 1 1 247 247 2 0 247 247 3 1 247 247 4 1 247 247 1 672 889 889 2 315 889 889 4 0 447 889 1 830 1085 1085 4 0 1085 1085 
AATTTTATACCCGGGCCACAAAGAGGCGTCATTTTGCGGGGTCAGCTACAGCCCAGCAATATCATCTGCAGTGATACTTCCAGCTGTTCTCCTGCCTTGCTAGGCTGCAAGACTTCTCTGTGATTCATGNNNNNNNNNNNNNNNNNNNNNNNNNNAATTTTAACCAACCAACCAGCCCATCCTCCTCCTCCTCCATCTTTGGGCAGGCAGCGTCTTTGACTTGCATATCGAGAGCAGCTTACCAGTTGTGACCATCTCAGTTCGGGGACCGGCTGTCTCACTTCTGCAGTGCATGGGAAGCAATAACTACGGACAGATGGGTCCTAACCACTGTGGGGTCAGGTTATGTCATCCAACTTCTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTATCAGGGAAAGGGATTCTGCTCCTGGCACTTCCTGATCCCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGTGGTGGTTTTTCCAGACCACAGCAAGTTTCCTAATCATCAGTCAACATTATCAATACACTGGTCTCCNNNNNNNTCTGTCATCAGCCCCACGTGTCTTTAGTGAATGTCTGATGATGGTTGTAGTGACTTATCCCCAATGGAGGGAAATTCATGTCCTCCCATACCTAGACAACTGGCTAGTGAGGGGCAGATCCAAAAAATAAGTCCTCAGTCCTCTCCAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 1 0 411 411 2 191 411 411 3 0 411 411 4 191 411 411 1 0 503 503 2 0 503 503 3 0 503 503 4 0 503 503 1 114 352 427 2 114 352 427 3 0 427 427 4 114 427 427 1 0 396 541 2 0 541 541 3 146 477 541 4 146 396 541 1 0 730 730 2 84 545 730 3 0 631 730 4 0 545 730 1 0 256 257 2 0 256 257 3 0 257 257 4 0 256 257 1 0 325 325 2 72 325 325 3 72 325 325 4 2 325 325 2 0 257 257 4 0 257 257 1 0 566 566 2 0 566 566 3 0 566 566 4 0 566 566 1 0 427 661 2 195 661 661 3 195 661 661 4 195 427 661 1 0 216 414 2 0 414 414 3 0 414 414 1 0 829 991 2 409 829 991 3 594 829 991 4 594 991 991 1 0 263 960 1 612 828 960 2 0 989 960 3 0 263 960 3 612 828 960 4 0 263 960 1 177 417 624 2 0 417 624 3 0 624 624 4 177 417 624 1 305 1351 1351 
CATGGGCTCCAGTCCAAGCCCAAATATCTCCACCACACGTTTACACCGCCGCAGCACGAGCCCAAGTTAGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAACTGTTCTGACACACAACTGCGTGCAAAGCTCTTTGAGATACTCAGGCCAAAGGCTCTATTAAGTACAAAGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGCTACCCAAACTGAACTGACTCTCTGGTATCAATGGCTCTGTTTGAACGTAATAAGATGTTATAGGGTTGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTGGCACCAGGAAAATGTAAATACATTTCACATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGGGCAGGGGGGGCTGCAAGGTTATTGTAGTGAGGGCACAGTATTGTCACCCTTACTTTTGTGCTGTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 2 305 562 1351 2 1099 1351 1351 3 0 1351 1351 AATTTTAGCATTCACATTGTTTTTTCCCTTTTACTGCTGTGTCAAAGATGAAAACAGCTGAAGGATGACTTAAAGATCCAGCTGAATGGTATCATTTACCGCTGTACTCTGGGACATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGGCTCCAGTCCAAGCCCAAATATCTCCACCACACGTTTACACCGCCGCAGCACGAGCCCAAGTTAGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAACTGTTCTGACACACAACTGCGTGCAAAGCTCTTTGAGATACTCAGGCCAAAGGCTCTATTAANTACAAAGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGCTACCCAAACTGAACTGACTCTCTGGTATCAATGGCTCTGTTTGAACGTAATAAGATGTTATAGGGTTGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCACCAGCAAAGGGATTCATTACTTGTCCTTCAGTTTCTGGCACCAGGAAAATGTAAATACATTTCACATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 4 305 1351 1351 
CATGGGCTCCAGTCCAAGCCCAAATATCTCCACCACACGTTTACACCRCCGCAGCACGAGCCCAAGTTAGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAACTGTTCTGACACACAACTGCGTGCAAAGCTCTTTGAGATACTCAGGCCAAAGGCTCTATTAARTACAAAGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGCTACCCAAACTGAACTGACTCTCTGGTATCAATGGCTCTGTTTGAACGTAATAAGATGTTATAGGGTTGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCACCAGCAAAGGGATTCATTACTTGTCCTTCAGTTTCTGGCACCAGGAAAATGTAAATACATTTCACATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGGGCAGGGGGGGCTGCAAGGTTATTGTAGTGAGGGCACAGTATTGTCACCCTTACTTTTGTGCTGTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN 1 0 464 464 2 0 464 464 3 0 464 464 4 0 464 464 1 236 570 960 2 0 570 960 3 339 570 960 4 0 1241 960 1 447 873 873 2 247 873 873 3 0 873 873 4 651 873 873 1 0 412 412 2 0 235 412 3 0 235 412 4 0 235 412 1 0 810 810 1 0 1239 960 4 0 589 960 2 0 247 839 3 0 839 839 4 0 247 839 1 359 577 577 2 0 577 577 4 14 577 577 1 0 230 841 1 620 841 841 2 0 417 841 2 620 841 841 3 0 841 841 4 0 841 841 1 0 237 237 2 0 237 237 3 0 237 237 4 0 237 237 1 0 498 498 2 0 234 498 3 0 234 498 4 0 234 498 2 0 300 300 1 0 240 385 2 0 240 385 3 0 385 385 4 0 240 385 1 0 523 561 2 0 249 561 3 0 561 561 4 0 249 561 1 191 420 420 2 183 420 420 4 0 420 420 2 0 259 259 3 0 259 259 4 0 259 259 1 0 253 253 2 0 253 253 3 0 253 253 4 0 253 253 1 0 246 246 2 0 246 246 3 0 246 246 4 0 246 246 1 224 568 568 2 342 568 568 3 0 568 568 4 224 568 568 1 0 226 227 2 0 226 227 3 0 226 227 4 0 227 227 1 49 286 286 2 49 286 286 3 0 286 286 4 49 286 286 1 0 228 228 2 0 228 228 3 0 228 228 4 0 228 228 1 0 553 554 2 0 554 554 3 0 241 554 4 0 554 554 1 187 715 715 2 0 715 
715 3 187 715 715 4 187 715 715 1 0 236 236 2 0 236 236 3 0 236 236 4 0 236 236 1 0 473 473 1 299 766 766 2 0 766 766 3 0 766 766 4 0 766 766 1 0 247 247 3 0 247 247 4 123 247 247
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-34-bf84d231351c> in <module>() ----> 1 remote_build_ref_clusters(self) <ipython-input-33-22b5bd623ae0> in remote_build_ref_clusters(self) 66 print("") 67 clust = [] ---> 68 avars = refvars(arr.view(np.uint8), PSEUDO_REF) 69 dat = b"".join(avars.view("S1")[:, 0]).decode() 70 snpstring = "".join(["*" if i else " " for i in avars[:, 1]]) ValueError: cannot assign slice from input of different size
"use bedtools to pull in clusters and match catgs with alignments"
cmd1 = [
ipyrad.bins.bedtools,
"bamtobed",
"-i",
os.path.join(
data.dirs.across,
"{}.cat.sorted.bam".format(self.data.name)
)
]
" ".join(cmd1)
'/home/deren/local/src/ipyrad/bin/bedtools-linux-x86_64 bamtobed -i /home/deren/local/src/ipyrad/tests/6-tortas/test_across/test.cat.sorted.bam'
cmd2 = [
ipyrad.bins.bedtools,
"merge",
"-d", "0",
"-i", "-",
]
" ".join(cmd2)
'/home/deren/local/src/ipyrad/bin/bedtools-linux-x86_64 merge -d 0 -i -'
self.remote_build_ref_clusters()
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-35-94508fef440e> in <module>() ----> 1 self.remote_build_ref_clusters() ~/local/src/ipyrad/ipyrad/assemble/cluster_across.py in remote_build_ref_clusters(self) 594 arr = np.zeros((len(rdict), len(read.seq)), dtype=bytes) 595 for idx, key in enumerate(keys): --> 596 arr[idx] = list(rdict[key]) 597 598 # get consens seq and variant site index ValueError: cannot copy sequence with size 288 to array axis with dimension 371
test.run("6", ipyclient=ipyclient)
Assembly: test [####################] 100% 0:00:00 | concatenating bams | s6 | [####################] 100% 0:00:00 | building clusters | s6 | Encountered an unexpected error (see ./ipyrad_log.txt) Error message is below ------------------------------- cannot copy sequence with size 288 to array axis with dimension 371
test2 = test.branch("test2")
from ipyrad.assemble.consens_se import *
self = Step5(test2, True, ipyclient)
self.remote_calculate_depths()
self.remote_make_chunks()
[####################] 100% 0:00:16 | calculating depths | s5 | [####################] 100% 0:00:37 | chunking clusters | s5 |
statsdicts = self.remote_process_chunks()
[####################] 100% 0:14:53 | consens calling | s5 |
sample = self.samples[0]
data = self.data
# write sequences to SAM file
combs1 = glob.glob(os.path.join(
data.tmpdir,
"{}_tmpcons.*".format(sample.name)))
combs1.sort(key=lambda x: int(x.split(".")[-1]))
# open sam handle for writing to bam
samfile = sample.files.consens.replace(".bam", ".sam")
with open(samfile, 'w') as outf:
# parse fai file for writing headers
fai = "{}.fai".format(data.paramsdict["reference_sequence"])
fad = pd.read_csv(fai, sep="\t", names=["SN", "LN", "POS", "N1", "N2"])
headers = ["@HD\tVN:1.0\tSO:coordinate"]
headers += [
"@SQ\tSN:{}\tLN:{}".format(i, j)
for (i, j) in zip(fad["SN"], fad["LN"])
]
outf.write("\n".join(headers) + "\n")
# write to file with sample names imputed to line up with catg array
counter = 0
for fname in combs1:
with open(fname) as infile:
# impute catg ordered seqnames
data = infile.readlines()
fdata = []
for line in data:
name, chrom, rest = line.rsplit(":", 2)
fdata.append(
"{}_{}:{}:{}".format(name, counter, chrom, rest)
)
counter += 1
outf.write("".join(fdata) + "\n")
cmd = [ip.bins.samtools, "view", "-Sb", samfile]
with open(sample.files.consens, 'w') as outbam:
proc = sps.Popen(cmd, stderr=sps.PIPE, stdout=outbam)
comm = proc.communicate()[1]
if proc.returncode:
raise IPyradError("error in samtools: {}".format(comm))
--------------------------------------------------------------------------- IPyradError Traceback (most recent call last) <ipython-input-12-64d33fc863e2> in <module>() 4 comm = proc.communicate()[1] 5 if proc.returncode: ----> 6 raise IPyradError("error in samtools: {}".format(comm)) IPyradError: error in samtools: b'[E::sam_parse1] CIGAR and query sequence are of different length\n[W::sam_read1] parse error at line 10690\n[main_samview] truncated file.\n'
368852-370424
-1572
self.remote_concatenate_chunks()
self.data_store(statsdicts)
[####################] 100% 0:01:20 | indexing alleles | s5 |
--------------------------------------------------------------------------- IPyradError Traceback (most recent call last) <ipython-input-8-b7d045fddf04> in <module>() ----> 1 self.remote_concatenate_chunks() 2 self.data_store(statsdicts) ~/local/src/ipyrad/ipyrad/assemble/consens_se.py in remote_concatenate_chunks(self) 315 if not job.successful(): 316 ip.logger.error("error in step 5: %s", job.exception()) --> 317 raise IPyradError(job.exception()) 318 319 IPyradError: IPyradError(error in samtools: b'[E::sam_parse1] CIGAR and query sequence are of different length\n[W::sam_read1] parse error at line 10690\n[main_samview] truncated file.\n')
shutil.rmtree(self.data.tmpdir)
self.data.save()
from ipyrad.assemble.cluster_across import *
self = Step6(test, True, ipyclient)
self.remote_concat_bams()
[####################] 100% 0:00:56 | concatenating bams | s6 |
self.remote_build_ref_regions()
[####################] 100% 0:00:13 | building clusters | s6 |
self.regions[:10]
[('Contig0', 3480, 3747), ('Contig0', 24390, 24761), ('Contig0', 24903, 25488), ('Contig0', 30442, 30712), ('Contig0', 33784, 34013), ('Contig0', 37975, 38628), ('Contig0', 40387, 40730), ('Contig0', 41782, 42162), ('Contig0', 44567, 44783), ('Contig0', 47550, 47763)]
# access reads from bam file using pysam
bamfile = AlignmentFile(
os.path.join(
self.data.dirs.across,
"cat.sorted.bam"),
'rb')
# catcons output file for raw clusters and db samples
outfile = gzip.open(
os.path.join(
self.data.dirs.across,
"{}.catcons.gz".format(self.data.name)),
'wb')
# write header line to catcons with sample names
snames = ['ref'] + sorted([i.name for i in self.samples])
outfile.write(
b" ".join([i.encode() for i in snames]) + b"\n")
for region in self.regions[:100]:
reads = bamfile.fetch(*region)
# get reference sequence
refn, refs = get_ref_region(
self.data.paramsdict["reference_sequence"],
region[0], region[1] + 1, region[2] + 1
)
# build cluster dict for sorting
rdict = {}
for read in reads:
rdict[read.qname] = read.seq
keys = sorted(rdict.keys(), key=lambda x: x.rsplit(":", 2)[0])
# make an empty array
arr = np.zeros((len(snames), len(refs)), dtype=bytes)
# fill the array
arr[0] = list(refs)
for idx, key in enumerate(keys):
# get start and stop from this seq
sname = key.rsplit("_", 1)[0]
rstart, rstop = key.rsplit(":", 2)[-1].split("-")
sidx = snames.index(sname)
# get start and stop relative to reference region
start = int(rstart) - int(region[1]) - 1
stop = start + int(rstop) - int(rstart)
print(key)
print(rstart, rstop, start, stop, len(rdict[key]), len(refs))
arr[sidx, start: stop] = list(rdict[key])
print("")
arr[arr == b""] = b"N"
clusts.append(b"\n".join([b"".join(line) for line in arr]))
AGO09concat_0:1:3481-3748 3481 3748 0 267 267 267 PZ03concat_0:1:3481-3748 3481 3748 0 267 267 267 LP08concat_0:1:24391-24679 24391 24679 0 288 288 371 PBR_B125concat_0:1:24391-24762 24391 24762 0 371 371 371 AGO09concat_1:1:24904-25489 24904 25489 0 585 585 585 PBR_B125concat_1:1:25020-25249 25020 25249 116 345 229 585 PZ03concat_1:1:30443-30713 30443 30713 0 270 270 270 AGO09concat_2:1:33785-34014 33785 34014 0 229 229 229 LP08concat_1:1:33785-34014 33785 34014 0 229 229 229 PBR_B125concat_2:1:33785-34014 33785 34014 0 229 229 229 PZ03concat_2:1:33785-34014 33785 34014 0 229 229 229 PZ03concat_3:1:37976-38629 37976 38629 0 653 653 653 AGO09concat_3:1:40396-40633 40396 40633 8 245 237 343 LP08concat_2:1:40388-40633 40388 40633 0 245 245 343 PBR_B125concat_3:1:40396-40633 40396 40633 8 245 237 343 PZ03concat_4:1:40388-40731 40388 40731 0 343 343 343 AGO09concat_4:1:41783-42163 41783 42163 0 380 380 380 LP08concat_3:1:41920-42163 41920 42163 137 380 243 380 PBR_B125concat_4:1:41920-42163 41920 42163 137 380 243 380 PZ03concat_5:1:41920-42163 41920 42163 137 380 243 380 AGO09concat_5:1:44568-44784 44568 44784 0 216 216 216 LP08concat_4:1:44568-44784 44568 44784 0 216 216 216 PZ03concat_6:1:44568-44784 44568 44784 0 216 216 216 PBR_B125concat_5:1:47551-47764 47551 47764 0 213 213 213 AGO09concat_6:1:47992-48229 47992 48229 0 237 237 237 PBR_B125concat_6:1:49479-50005 49479 50005 0 526 526 526 AGO09concat_7:1:56089-56333 56089 56333 111 355 244 355 LP08concat_5:1:56089-56333 56089 56333 111 355 244 355 PBR_B125concat_7:1:56089-56333 56089 56333 111 355 244 355 PZ03concat_7:1:55978-56333 55978 56333 0 355 355 355 AGO09concat_8:1:62284-62501 62284 62501 28 245 217 457 LP08concat_6:1:62284-62501 62284 62501 28 245 217 457 PBR_B125concat_8:1:62284-62501 62284 62501 28 245 217 457 PZ03concat_8:1:62256-62713 62256 62713 0 457 457 457 PZ03concat_9:1:64734-65000 64734 65000 0 266 266 266 AGO09concat_9:1:68597-68887 68597 68887 829 1119 290 1301 LP08concat_7:1:68597-69007 68597 
69007 829 1239 410 1301 PBR_B125concat_9:1:68597-69069 68597 69069 829 1301 472 1301 PZ03concat_10:1:67768-68840 67768 68840 0 1072 960 1301
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-95-4ea82413ea79> in <module>() 50 print(key) 51 print(rstart, rstop, start, stop, len(rdict[key]), len(refs)) ---> 52 arr[sidx, start: stop] = list(rdict[key]) 53 print("") 54 ValueError: cannot copy sequence with size 960 to array axis with dimension 1072
# avars = refvars(arr.view(np.uint8), PSEUDO_REF)
# dat = b"".join(avars.view("S1")[:, 0]).decode()
# snpstring = "".join(["*" if i else " " for i in avars[:, 1]])
# cname = "ref_{}:{}-{}\n{}".format(*region, dat)
# cname
67768 -68840
-1072
clusts.append(b"\n".join([b"".join(line) for line in arr]))
b'AATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCTTCTTGGCTTGTCTCTTGTATTCATTAAATGGAGCATCTCTTGTCACTGTCCAGCAATAGTCTGCAAGCATTGATGGGCTCCATTTGCCCTGATAGCGTTTCTCCATTGTTGCAATGTCCTGGTGAAATCGCTCGCCGTGCTCGTCGCTCACTGCTCCGCAGTTCGGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATGTTGCAACCAAGGCTTTTGTATGCCTTGAGGAGGTTTTCCACCAACAACCTGTAGTTGTCTGCCTTGTTGTTTCCGAGAAAATT\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nAATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTSAGTGTCTACTCAGCGCTTCTTGGCTTGTCTCTTTTATTCATTAAATGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATTGATAGGCTCCATTTGCCCTGATAGCGTTTCTCCATTGTTGCAATGTCCTGGTGGAATCGCTCGCCATGNNNNNCGCTCACTGCTCCGCAGTTCAGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nAATTTATTATGAAACAAAAGGCNAAAAACNATTATGTACATANTTTAGTCCTATTCAGTGTCTACTCAGCNCTTCTTGGCTTGTCTCTTGTATTCATTAAATGGAGGATCTCTTGTCNNNNNNNNNNNNNNNNNNNNNNNNATTGATGGGCTCCATTTGCCCTGATAGCGTTTCTCCATTGTTGCAATGTCCTGGTGAAATCGCTCGCCNNNNNNNNCGCTCACTGCTCNGCAGTTCNGTGGAAAAAAATCTNNATGAGAGTGCAAAAAATGTATCTTTAGTGACATGTTGCAACCAAGGCTTTTGTATGCCTTGAGAAGGTTTTCCACCAACAACCTGTAGTTGTCTGCCTTGTTGTTTCCGAGAAAATT\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'
outfile.close()
test.run('6')
Assembly: test [####################] 100% 0:00:56 | concatenating bams | s6 | Encountered an unexpected error (see ./ipyrad_log.txt) Error message is below ------------------------------- cannot copy sequence with size 288 to array axis with dimension 371
## do cigar building again, this time with orientations
data = self.data
sample = list(data.samples.values())[0]
sample.name
'AGO02concat'
# get all regions with reads. Generator to yield (str, int, int)
fullregions = bedtools_merge(data, sample).strip().split("\n")
regions = (i.split("\t") for i in fullregions)
regions = [(i, int(j), int(k)) for (i, j, k) in regions]
regs = regions[:20]
# access reads from bam file using pysam
bamfile = AlignmentFile(
os.path.join(
data.dirs.refmapping,
"{}-mapped-sorted.bam".format(sample.name)),
'rb')
# iterate over all regions
opath = os.path.join(
data.dirs.clusts, "{}.alt.clustS.gz".format(sample.name))
out = gzip.open(opath, 'wt')
idx = 0
clusters = []
for reg in regs:
# uncomment and compare against ref sequence when testing
ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
reads = bamfile.fetch(*reg)
# store reads in a dict
rdict = {}
# paired-end data cluster building
if "pair" in data.paramsdict["datatype"]:
# match paired reads together in a dictionary
for read in reads:
print("\t".join(
str(i) for i in
[read.is_read1, read.is_reverse,
read.seq[:6], read.seq[-6:],
*reg
]))
if read.qname not in rdict:
rdict[read.qname] = [read, None]
else:
rdict[read.qname][1] = read
# sort keys by derep number
keys = sorted(
rdict.keys(),
key=lambda x: int(x.split("=")[-1]), reverse=True)
# build the cluster based on map positions, orientation, cigar
clust = []
clust.append("{}\n{}".format('ref', ref[1]))
for key in keys:
r1, r2 = rdict[key]
if r1 and r2:
#lref = len(ref[1])
aref = np.array(list(ref[1]))
lref = reg[2] - reg[1]
arr1 = np.zeros(lref, dtype="U1")
arr2 = np.zeros(lref, dtype="U1")
arr1.fill("-")
arr2.fill("-")
# how far ahead of the start does this read begin
seq = cigared(r1.seq, r1.cigar)
start = r1.reference_start - reg[1]
arr1[start:start + len(seq)] = list(seq)
seq = cigared(r2.seq, r2.cigar)
start = r2.reference_start - reg[1]
arr2[start:start + len(seq)] = list(seq)
arr3 = join_arrays(arr1, arr2)
pairseq = "".join(arr3)
ori = "+"
if r1.is_reverse:
ori = "-"
derep = r1.qname.split("=")[-1]
rname = "{}:{}-{};size={};{}".format(*reg, derep, ori)
clust.append("{}\n{}".format(rname, pairseq))
# single-end data cluster building
else:
for read in reads:
rdict[read.qname] = read
# sort keys by derep number
keys = sorted(
rdict.keys(),
key=lambda x: int(x.split("=")[-1]), reverse=True)
# build the cluster based on map positions, orientation, cigar
clust = []
for key in keys:
r1 = rdict[key]
#aref = np.array(list(ref[1]))
lref = reg[2] - reg[1]
arr1 = np.zeros(lref, dtype="U1")
arr1.fill("-")
# how far ahead of the start does this read begin
seq = cigared(r1.seq, r1.cigar)
start = r1.reference_start - reg[1]
arr1[start:start + len(seq)] = list(seq)
aseq = "".join(arr1)
ori = "+"
if r1.is_reverse:
ori = "-"
derep = r1.qname.split("=")[-1]
rname = "{}:{}-{};size={};{}".format(*reg, derep, ori)
clust.append("{}\n{}".format(rname, aseq))
# store this cluster
if clust:
clusters.append("\n".join(clust))
idx += 1
# if 1000 clusters stored then write to disk
if not idx % 10000:
if clusters:
out.write("\n//\n//\n".join(clusters) + "\n//\n//\n")
clusters = []
# write final remaining clusters to disk
if clusters:
out.write("\n//\n//\n".join(clusters) + "\n//\n//\n")
out.close()
True False CATGAG CTACAG Contig0 3481 3748 True False CATGAG CTACAG Contig0 3481 3748 False True TTCCAG AGAATT Contig0 3481 3748 False True TTCCAG AGAATT Contig0 3481 3748 True False CATGGT GGACAC Contig0 4940 5088 False True GAGGGA GCAATT Contig0 4940 5088 False False AATTGG GGAGTT Contig0 5481 5571 True True CATTTT GGCATG Contig0 5481 5571 False False AATTGC GGGATT Contig0 6267 6397 True True ATCCGG CCCATG Contig0 6267 6397 True False CATGAT TGTGAA Contig0 14907 15026 False True CGCCCC AGAATT Contig0 14907 15026 False False AATTCT CTACAG Contig0 16737 16933 True True GCAGGA GGCATG Contig0 16737 16933 True False CATGAA CAGGCA Contig0 19897 20031 False True CTGAGT TAAATT Contig0 19897 20031 True False CATGGA TGTTGC Contig0 23715 23823 False True CCCTTG CAAATT Contig0 23715 23823 False False AATTTA ACTTCT Contig0 24391 24679 False False AATTAT AATGGA Contig0 24391 24679 False False AATTTA CTTGTC Contig0 24391 24679 True True ATTGAT GCCATG Contig0 24391 24679 True True CGCTCA GACATG Contig0 24391 24679 True True CGCTCA GACATG Contig0 24391 24679 False False AATTGA AGTCTC Contig0 25290 25447 False False AATTGA AGTCTC Contig0 25290 25447 True True GATCGT TCCATG Contig0 25290 25447 True True GATTGT TCCATG Contig0 25290 25447 False False AATTTA TCTTAA Contig0 26192 26308 True True TATGTT CACATG Contig0 26192 26308 True False CATGTT CATAGC Contig0 28528 28669 False True CATAGC AAAATT Contig0 28528 28669 True False CATGCC GTTTTA Contig0 29084 29217 False True CGATCT GCAATT Contig0 29084 29217 True False CATGAG AAAAAT Contig0 29454 29541 False True GGGGTT TCAATT Contig0 29454 29541 False False AATTCC CTTCCT Contig0 30443 30713 True True TGGTAG CACATG Contig0 30443 30713 False False AATTAG ATTAAT Contig0 33785 34014 False False AATTAG ATTAAT Contig0 33785 34014 True True ATAGGC TGCATG Contig0 33785 34014 True True ATAGGC TGCATG Contig0 33785 34014 False False AATTTG TTAGCC Contig0 34452 34659 True True GGCAGC CTCATG Contig0 34452 34659 False False AATTTC CGTAGA Contig0 34452 
34659 True True TCTAAT AACATG Contig0 34452 34659 False False AATTAA TTTCTT Contig0 37976 38247 False False AATTAA TTTCTT Contig0 37976 38247 True True GAGGCT CCCATG Contig0 37976 38247 True True GAGGCT CCCATG Contig0 37976 38247 True False CATGGA CATTGC Contig0 38736 38961 False True GGTCGG GCAATT Contig0 38736 38961 True False CATGCA GTGGTA Contig0 40396 40633 True False CATGCA GTGGTA Contig0 40396 40633 True False CATACA GTGGCA Contig0 40396 40633 False True GCTTGT CAAATT Contig0 40396 40633 False True GCTTGT CAAATT Contig0 40396 40633 False True GCTTGT CAAATT Contig0 40396 40633
## branch to reduce size
## run step 6 until building clusters and check orientations
self.remote_index_refs()
self.remote_run(
function=concat_multiple_edits,
printstr=("concatenating ", "s3"),
args=(),
)
self.remote_run(
function=merge_end_to_end,
printstr=("join unmerged pairs ", "s3"),
args=(False, False,),
)
self.remote_run(
function=dereplicate,
printstr=("dereplicating ", "s3"),
args=(self.nthreads,),
threaded=True,
)
self.remote_run(
function=split_endtoend_reads,
printstr=("splitting dereps ", "s3"),
args=(),
)
self.remote_run(
function=mapping_reads,
printstr=("mapping reads ", "s3"),
args=(self.nthreads,),
threaded=True,
)
data.run("123")
Assembly: tortas [####################] 100% 0:16:40 | loading reads | s1 | [####################] 100% 0:00:05 | processing reads | s2 | found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an 
error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see 
ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt found an error in step2; see ipyrad_log.txt skipping samples not in state==2: ['AGO02concat', 'AGO08concat', 'AGO09concat', 'AGO11concat', 'AGO12concat', 'AGO14concat', 'AGO18concat', 'AGO20concat', 'AGO23concat', 'AGO32concat', 'CAZ05concat', 'CAZ06concat', 'CAZ07concat', 'CAZ11concat', 'CAZ13concat', 'CAZ14concat', 'CAZ15concat', 'CF01concat', 'CF03concat', 'CF05concat', 'CF09concat', 'CF15concat', 'CF16concat', 'CRU02concat', 'CRU12concat', 'CRU14concat', 'CRU23concat', 'CRU38concat', 'CRU41concat', 'CRU43concat', 'CRU54concat', 'CRU61concat', 'CRU64concat', 'ESP01concat', 'ESP02concat', 'ESP04concat', 'ESP08concat', 'ESP09concat', 'ESP10concat', 'ESP11concat', 'ESP12concat', 'ESP13concat', 'ESP14concat', 'LP01concat', 'LP02concat', 'LP06concat', 'LP08concat', 'LT02concat', 'LT05concat', 'LT06concat', 'LT11concat', 'PBL_H01concat', 'PBL_H126concat', 'PBL_H27concat', 'PBL_H60concat', 'PBL_H86concat', 'PBL_I143concat', 'PBL_I188concat', 'PBR_B125concat', 'PBR_B152concat', 'PBR_D49concat', 'PBR_E18concat', 'PBR_F89concat', 'PBR_G244concat', 'PZ03concat', 'PZ04concat', 'PZ05concat', 'PZ06concat', 'PZ08concat', 'PZ09concat', 'PZ28concat', 'PZ67concat', 'PZ70concat', 'PZ82concat', 'SCR02concat', 'SCR05concat', 'SCR06concat', 'SCR07concat', 'SCR08concat', 'SCR10concat', 'SCR21concat', 'SCR22concat', 'SCR24concat', 'VA05concat', 'VA07concat', 'VA08concat', 'VA1082concat', 'VA373concat', 
'VA378concat', 'VA387concat', 'VA423concat', 'VA984concat', 'VD02concat', 'VD05concat', 'VD06concat', 'VD08concat', 'VD13concat', 'VD14concat', 'VD15concat', 'VD23concat', 'VD24concat', 'VD25concat'] Encountered an unexpected error (see ./ipyrad_log.txt) Error message is below ------------------------------- no samples ready for step 3
step.remote_concat_bams()
step.remote_build_ref_regions()
self = step
regs = [next(self.regions) for i in range(10)]
regs
[('Contig0', 76479, 77014), ('Contig0', 79123, 79378), ('Contig0', 81338, 81851), ('Contig0', 82205, 82795), ('Contig0', 83142, 83536), ('Contig0', 95644, 96211), ('Contig0', 101552, 101787), ('Contig0', 103468, 103708), ('Contig0', 107537, 108114), ('Contig0', 132703, 133571)]
# access reads from bam file using pysam
bamfile = AlignmentFile(
    os.path.join(
        self.data.dirs.across,
        "cat.sorted.bam"),
    'rb')

# catcons output file for raw clusters and db samples
outfile = gzip.open(
    os.path.join(
        self.data.dirs.across,
        "{}.catcons.gz".format(self.data.name)),
    'wb')

# write header line to catcons with sample names
outfile.write(
    b" ".join([i.name.encode() for i in self.samples]) + b"\n")

# locus counter and buffer of formatted clusters (used by downstream cells)
lidx = 0
clusts = []

for region in regs:
    # all reads mapped inside this (contig, start, end) window
    reads = bamfile.fetch(*region)

    # reference sequence for the window (get_ref_region takes 1-indexed coords)
    refn, refs = get_ref_region(
        data.paramsdict["reference_sequence"],
        region[0], region[1] + 1, region[2] + 1)

    # build cluster dict for sorting (reads are already oriented)
    rdict = {}
    for read in reads:
        rdict[read.qname] = read.seq
    keys = sorted(rdict.keys(), key=lambda x: x.rsplit(":", 2)[0])

    # stack reference + reads into an alignment array, padding with N
    arr = np.zeros((len(rdict) + 1, len(refs)), dtype=bytes)
    arr[0] = list(refs)
    for idx, key in enumerate(keys):
        # read name encodes its mapped span as "...:rstart-rstop" (1-indexed)
        rstart, rstop = key.rsplit(":", 1)[-1].split("-")
        # column where the read begins within the reference window.
        # BUG FIX: was `region[1] - (int(rstart) - 1)`, which is negative for
        # any read starting after the window start, so max(0, ...) pinned it
        # to column 0; and `stop` was the raw read length instead of
        # start + length, mis-sizing the slice (the ValueError seen earlier).
        start = max(0, (int(rstart) - 1) - region[1])
        stop = min(len(refs), start + len(rdict[key]))
        # trim the read if it overhangs the window so the slice fits exactly
        arr[idx + 1, start:stop] = list(rdict[key][:stop - start])
    arr[arr == b""] = b"N"

    # one alignment row per line, blank line between loci
    for line in arr:
        outfile.write(line.tostring() + b"\n")
    outfile.write(b"\n")
outfile.close()
AGO09concat_10:1:76480-76715 0 0 76479 0 235M -1 -1 235 CATGTCTTATCTGGATCAGTCAACAGCATCTCTGCCTTTATCGGCCATCTATCTACAATATGTATCTAAGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTACACCTTCTTTTAAYAGATCCCTGGGGACCCAGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATTTCAG None [] AGO08concat_10:1:76480-77015 0 0 76479 0 535M -1 -1 535 CATGTCTTATCTGGATCAGTCAACAGCATCTCTGCCTTTATCGGCCATCTATCTACAATATGTATCTAAGCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTACACCTTCTTTTAACAGATCCCTGGGGACCCAGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGAGAATCTCTATAAGCACTATAGGGCTCATGACACACAGCTGTGTCTTTCTGATTCCATCCAATAGGGGGCAGCAGAGCATATAAACATTAACCAGTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATATGCAGTTTTTCTTGTTCCCCTCCAGAGGGGTCAGTAATAAAACAATGGATATCATGGGATTAGAGCTGAAATT None [] AGO11concat_10:1:76480-76711 0 0 76479 0 231M -1 -1 231 CATGTCTTATCTGGATCAGTCAACAGCATCTCTGCCTTTATCNGCCATCTATCTACAATATGTATCTAAGCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTACACCTTCTTTTAACAGATCCCTGGGGACCCAGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATT None [] AGO02concat_9:1:76480-76711 0 0 76479 0 231M -1 -1 231 CATGTCTTATCTGGATCAGTCAACAGCATCTCTGCCTTTATCGGCCATCTATCTACAATATGTATCTAAGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTACACCTTCTTTTAAYAGATCCCTGGGGACCCAGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATT None [] AGO09concat_11:1:79124-79379 0 0 79123 0 255M -1 -1 255 CATGGACATAAGCCTTACGCCTCTCCTGGAAGTGGAGTTATGATGTCCGTGCAGTGGGGCGCTTACCTCGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGGTGCTGGNCAAATGGTCGAAGACTTGTTCCAACCTCTCCACTCTTGAGACCCAAACTTTGGGTCTGGTGGGAATT None [] AGO09concat_12:1:81339-81852 0 0 81338 0 513M -1 -1 513 
CATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGGACNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTATTACACKTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGTTATAATTTGAAAAGTCAGGAAAAGAGCAGAGGCCGGACGTGGCTTCTCTCCCCTTGGCTTCACTGGGTTTTGATTGGCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCAGGACCTCGGTCACTCTGGGCCTCATTCTGCTCTCCGGGATGCTGGTTTAGTGCTGCTGGGACTCCATG None [] AGO08concat_11:1:81339-81852 0 0 81338 0 513M -1 -1 513 CATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGGACNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGYTATAATTTGAAAAGTCAGGAAAAGAGCAGAGGCCGGACGTGGCTTCTCTCCCCTTGGCTTCACTGGGTTTTGATTGGCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCAGGACCTCGGTCACTCTGGGCCTCATTCTGCTCTCCGGGATGCTGGTTTAGTGCTGCTGGGACTCCATG None [] AGO11concat_11:1:81339-81852 0 0 81338 0 513M -1 -1 513 CATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGGACNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGTTATAATTTGAAAAGTCAGGAAAAGAGCAGAGGCTGGACGTGGCTTCTCTCCCCTTGGCTTCACTGGGTTTTGATTGGCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCAGGACCTCGGTCACTCTGGGCCTCATTCTGCTCTCCGGGATGCTGGTTTAGTGCTGCTGGGACTCCATG None [] AGO02concat_10:1:81339-81852 0 0 81338 0 513M -1 -1 513 
CATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGGACNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGTTATAATTTGAAAAGTCAGGAAAAGAGCAGAGGCNGGACGTGGCTTCTCTCCCCTTGGCTTCACTGGGTTTTGATTGGCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCAGGACCTCGGTCACTCTGGGCCTCATTCTGCTCTCCGGGATGCTGGTTTAGTGCTGCTGGGACTCCATG None [] AGO08concat_12:1:82206-82699 0 0 82205 0 493M -1 -1 493 CATGCAATGCAACAAGGTCTGGAGCTCCTTAGCGAGCCTTCGAGCCACCCAGTCCCTGAAATACACCCCCTGGTCCCTTTCAGGCTGGATCCAGAATGGAAAGTGTAACACCAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG None [] AGO09concat_13:1:82469-82796 0 0 82468 0 327M -1 -1 327 AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATGTGCTCATCCAGGCAGGAGCAAACCCTGAGCCCCGGCCACAGGTTCAAAGGTCTCAGCACCAGAGACAGGTGGAAAACCTTTTTCCCGTCCCGCAATT None [] AGO11concat_12:1:82469-82699 0 0 82468 0 230M -1 -1 230 AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG None [] AGO02concat_11:1:82469-82699 0 0 82468 0 230M -1 -1 230 
AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCKCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG None [] AGO09concat_14:1:83143-83367 0 0 83142 0 224M -1 -1 224 AGAAATTCCATCCTGATCCTTCGAACCCAGCACGTTCCTACAACAGGTTTCRGTTTTGACGAACTGGTATTTGTTGCTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCACCCTACCAAGCACCCGAGGGTGTCAGTGGCTGCCCTGAAATGACGGGCACTGCTTGTTGCTGCCCCATG None [] AGO11concat_13:1:83146-83367 0 0 83145 0 221M -1 -1 221 AATTCCATCCTGATCCTTCGAACCCAGCACGTTCCTACAACAGGTTTCGGTTTTGACGAACTGGTATTTGTTGCTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCACCCTACCAAGCACCCGAGGGTGTCAGTGGCTGCCCTGAAATGACGGGCACTGCTTGTTGCTGCCCCATG None [] AGO02concat_12:1:83146-83537 0 0 83145 0 391M -1 -1 391 AATTCCATCCTGATCCTTCGAACCCAGCACGTTCCTACAACAGGTTTCRGTTTTGACGAACTGGTATTTGTTGCTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCACCCTACCAAGCACCCGAGGGTGTCAGTGGCTGCCCTGAAATGACGGGCACTGCTTGTTGCTGCCCCATGGCCAGCAAGGCTGGTCCATAGGGGCTGCAGGGCACGGCATTATGTCCTCCGGGGAACTTCTGCAGCGNNNNNNNNNNNNNNNNNNNNNNNNNNNTATTGCCATCCCCGCCTCTGGAACGGGACATAAAACCGGCTAGCTGGCAGGATGTCCTGAGCGCCCTGGCAGAATT None [] AGO09concat_15:1:95645-95887 0 0 95644 0 242M -1 -1 242 AATTGAGGCAACTTTAACTGATATGTCCTGAAGTCACGAGGCCTTAATGGGCATCCTACGTGTAAAGGGCATCCTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAAAGTGATAAGAAGACTTTTTTATATACTGTGCAAGACTAACCTGTGCAACTCATTGCTACAGGATATCATG None [] AGO08concat_13:1:95645-95887 0 0 95644 0 242M -1 -1 242 AATTGAGGCAACTTTAACTGATATGTCCTGAAGTCACGAGGCCTTAATGGGCATCCTASGTGTAAAGGGCATCCTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAAGTGATAAGAAGACTTTTTTATATACTGTGCAAGACTAACCTGTGCAACTCATTGCTACAGGATATCATG None [] AGO11concat_14:1:95645-96212 0 0 95644 0 567M -1 -1 567 
AATTGAGGCAACTTTAACTGATATGTCCTGAAGTCACGAGGCCTTAATGGGCATCCTACGTGTAAAGGGCATCCTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAAAGTGATAAGAAGACTTTTTTATATACTGTGCAAGACTAACCTGTGCAACTCATTGCTACAGGATATCATGGAGGTAAGGAGCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTAAACCTCACAACAGCCTTGAAATCTAGTTCCATAGTCTCGAGCCCTTTGTACAAATAGGTATCCAGAAGCACAGAAACCTCAAGTGACTTGTCTCAGGTCACACCCGTATGGCACTGATGTAAGTGAACATACAGCATG None [] AGO02concat_13:1:95645-96212 0 0 95644 0 567M -1 -1 567 AATTGAGGCAACTTTAACTGATATGTCCTGAAGTCACGAGGCCTTAATGGGCATCCTACGTGTAAAGGGCATCCTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAAGTGATAAGAAGACTTTTTTATATACTGTGCAAGACTAACCTGTGCAACTCATTGCTACAGGATATCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTAAACCTCACAACAGCCTTGAAATCTAGTTCCATCGTCTCGAGCCCTTTGTACAAATAGGTATCCAGAAGCACAGAAACCTCAAGTGACTTGTCTCAGGTCACACCCGTATGGCACTGATGTAAGTGAACATACAGCATG None [] AGO09concat_16:1:101553-101788 0 0 101552 0 235M -1 -1 235 CATGCATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAGGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGNCTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT None [] AGO08concat_14:1:101553-101788 0 0 101552 0 235M -1 -1 235 CATGCATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAGGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGACTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT None [] AGO11concat_15:1:101553-101788 0 0 101552 0 235M -1 -1 235 
CATGCATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAGGCAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGACTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT None [] AGO02concat_14:1:101553-101788 0 0 101552 0 235M -1 -1 235 CATGCATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAGGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGACTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT None [] AGO09concat_17:1:103469-103709 0 0 103468 0 240M -1 -1 240 CATGGTCTACTAATCACAGAGATGCCACTGAGCTGGCCCTAGAACCAGGTGCAGCGTACGCAACTCTGTGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCCCTGGTTTGTCCCCAGGCAGGAGGCGAGTAGAGGCCTCCAGGTATCAGTTTCTATCTGTCGTGTTTAACAAATT None [] AGO08concat_15:1:103469-103709 0 0 103468 0 240M -1 -1 240 CATGGTCTACTAATCACAGAGATGCCACTGAGCTGGCCCTAGAACCAGGTGCAGCGTACGCAACTCTGTGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCCCTGGTTTGTCCCCAGGCAGGAGGCGAGTAGAGGCCTCCAGGTATCAGTTTCTATCTGTCGTGTTTAACAAATT None [] AGO11concat_16:1:103469-103709 0 0 103468 0 240M -1 -1 240 CATGGTCTACTAATCACAGAGATGCCACTGAGCTGGCCCTAGAACCAGGTGCAGCGTACGCAACTCTGTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCCCTGGTTTGTCCCCAGGCAGGAGGCGAGTAGAGGCCTCCAGGTATCAGTTTCTATCTGTCGTGTTTAACAAATT None [] AGO02concat_15:1:103469-103709 0 0 103468 0 240M -1 -1 240 CATGGTCTACTAATCACAGAGATGCCACTGAGCTGGCCCTAGAACCAGGTGCAGCGTACGCAACTCTGTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCCCTGGTTTGTCCCCAGGCAGGAGGCGAGTAGAGGCCTCCAGGTATCAGTTTCTATCTGTCGTGTTTAACAAATT None [] AGO02concat_16:1:107538-108115 0 0 107537 0 577M -1 -1 577 
CATGTTTAATCCTGAGAAGAGAAGACCAAGGGGGGACCCAATAACAGTCTTCAAATCTGTTAAGGGCTGTTATAGACAGGACGGTGATCAATTNNNNNNCATGTCCACTGAGGGTAGGACAAGAAGTAACGGGCTTAATCTGCAGCAAGGGAGATACAGGGGAGATATTANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGAGGTTGTGGAATCCCCGTCACAGGAGGTTTTGAAGAACAAGTCAGACAAACGCCTGTCAGGGATGGTCTAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGACCTCACAGTCCCTTTCAGCCTGACACTTCTACGATTCTATGATGCACTGGAAGAGAACGGTCAGAAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGGGTAGAGCTTTTCTAGTTTGGCTGTGCCAGCAAACTTTCCCAGTTAGAGCTGCAAGAGGCAGACTGAAAAATT None [] AGO09concat_18:1:107637-108115 0 0 107636 0 478M -1 -1 478 CATGTCCACTGAGGGTAGGACAAGAAGTAACGGGCTTAATCTGCAGCAAGGGAGATACAGGGTAGATATTANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGAGGTTGTGGAATCCCCGTCACAGGAGGTTTTGAAGAACAAGTCAGACAAACGCCTGTCAGGGATGGTCTAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGACCTCACAGTCCCTTTCAGCCTGACACTTCTACGATTCTATGATGCACTGGAAGAGAACGGTCAGAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGGGTAGAGCTTTTCTAGTTTGGCTGTGCCAGCAAACTTTCCCAGTTAGAGCTGCAAGAGGCAGACTGAAAAATT None [] AGO08concat_16:1:107637-108115 0 0 107636 0 478M -1 -1 478 CATGTCCACTGAGGGTAGGACAAGAAGTAACGGGCTTAATCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGAGGTTGTGGAATCCCCGTCACAGGAGGTTTTGAAGAACAAGTCAGACAAACGCCTGTCAGGGATGGTCTAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGACCTCACAGTCCCTTTCAGCCTGACACTTCTACGATTCTATGATGCACTGGAAGAGAACGGTCAGAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGGGTAGAGCTTTTCTAGTTTGGCTGTGCCAGCAAACTTTCCCAGTTAGAGCTGCAAGAGGCAGACTGAAAAATT None [] AGO11concat_17:1:107637-108115 0 0 107636 0 478M -1 -1 478 
CATGTCCACTGAGGGTAGGACAAGAAGTAACGGGCTTAATCTGCAGCAAGGGAGATACAGGGTAGATATTANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGAGGTTGTGGAATCCCCGTCACAGGAGGTTTTGAAGAACAAGTCAGACAAACGCCTGTCAGGGATGGTCTAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGACCTCACAGTCCCTTTCAGCCTGACACTTCTACGATTCTATGATGCACTGGAAGAGAACGGTCAGAAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGGGTAGAGCTTTTCTAGTTTGGCTGTGCCAGCAAACTTTCCCAGTTAGAGCTGCAAGAGGCAGACTGAAAAATT None [] AGO09concat_19:1:132704-133572 0 0 132703 0 868M -1 -1 868 CATGTATTAAACTTGAACACCTGCCCTTGCACTAGTAGGATTTTATGTTTGTATTTTGCATAACTTAGAATAAATAATAATGATATGATTTAACCGTATCCTGCCAGCCACCTTGGTAGAAATGCATCAGCCAGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTAATTCTGTTTTGTGTGNGAATAAAAAATGTGTGTGTTAAAAGAGAAATGTAAAGGCCATCACTGAACCTGATGGTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATTGACGACTCTAAAATATAAGACAGGCCCGTATCAATGAGGACAAAATAGCNGTAATCAAAACCAGCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGAAACTTTGGGTTCATCCTGCCAAGACTTCCCCAGAGCGTAGTGTCGTGATCGACAGAACCCAGCTCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGGACTCTGGACTAGTAACTATAACATCGACTGGTGGGAGAGTGTGCGTGTGAGTGATTGAATGAATGTGCTAATT None [] AGO02concat_17:1:132704-133221 0 0 132703 0 517M -1 -1 517 CATGTATTAAACTTGAACACCTGCCCTTGCACTAGTAGGATTTTATGTTTGTATTTTGCATAACTTAGAATAAATAATAATGATATGATTTAACCGTATCCTGCCAGCCACCTTGGTAGAAATGCATCAGCCAGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTAATTCTGTTTTGTGTGTGAATAAAAAATGTGTGTGTTAAAAGAGAAATGTAAAGGCCATCACTGAACCTGATGGTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATTGACGACTCTAAAATATAAGACAGGCCCGTATCAATGAGGACAAAATAGCTGTAATCAAAACCAGCATG 
None [] AGO11concat_18:1:132993-133572 0 0 132992 0 579M -1 -1 579 AATTAATTCTGTTTTGTGTGTGAATAAAAAATGTGTGTGTTAAAAGAGAAATGTAAAGGCCATCACTGAACCTGATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATTGACGACTCTAAAATATAAGACAGGCCCGTATCAATGAGGACAAAATAGCTGTAATCAAAACCAGCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGAAACTTTGGGTTCATCCTGCCAAGACTTCCCCAGAGCGTAGTGTCGTGATCGACAGAACCCAGCTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGGACTCTGGACTAGTAACTATAACATCGACTGGTGGGAGAGTGTGCGTGTGAGTGATTGAATGAATGTGCTAATT None []
# REPL check: start coordinate of the last region iterated above
region[1]
132703
revcomp("AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCKCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN")
'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGCTTGTATTAGAGATGGGGCTGAAGTCAAACCCTGACCTGAACAGTGAGAAAATCTGGTGCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTCCTCTGCCTCCCTCAAGAAGAAATCAAGAGKGGAAAGGAGCAGGGCGGGAGGTATGGGAAAGGAAGAATGGAATT'
# get consens seq and variant site index
clust = []
# per-column consensus over ref+reads; PSEUDO_REF presumably maps base counts
# to IUPAC codes -- TODO confirm against refvars implementation
avars = refvars(arr.view(np.uint8), PSEUDO_REF)
dat = b"".join(avars.view("S1")[:, 0]).decode()
# '*' marks each column flagged variable in avars[:, 1]
snpstring = "".join(["*" if i else " " for i in avars[:, 1]])
clust.append("ref_{}:{}-{}\n{}".format(*region, dat))
# or, just build variant string (or don't...)
# write all loci with [locids, nsnps, npis, nindels, ?]
for key in keys:
clust.append("{}\n{}".format(key, rdict[key]))
clust.append("SNPs\n" + snpstring)
clusts.append("\n".join(clust))
# advance locus counter
lidx += 1
# write chunks to file
# flush the buffer every 1000 loci; loci are separated by paired "//" lines
if not lidx % 1000:
outfile.write(
str.encode("\n//\n//\n".join(clusts) + "\n//\n//\n"))
clusts = []
# write remaining
if clusts:
outfile.write(
str.encode("\n//\n//\n".join(clusts) + "\n//\n//\n"))
outfile.close()
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-37-7858eebe5047> in <module>() 43 arr = np.zeros((len(rdict), len(refs)), dtype=bytes) 44 for idx, key in enumerate(keys): ---> 45 arr[idx] = list(rdict[key]) 46 47 # get consens seq and variant site index ValueError: cannot copy sequence with size 231 to array axis with dimension 535
step.remote_build_ref_clusters()
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-16-2a0e5422841e> in <module>() ----> 1 step.remote_build_ref_clusters() ~/Documents/ipyrad/ipyrad/assemble/cluster_across.py in remote_build_ref_clusters(self) 588 arr = np.zeros((len(rdict), len(read.seq)), dtype=bytes) 589 for idx, key in enumerate(keys): --> 590 arr[idx] = list(rdict[key]) 591 592 # get consens seq and variant site index ValueError: cannot copy sequence with size 543 to array axis with dimension 559
from ipyrad.assemble.clustmap import *
# run step3 (clustering/mapping) with 8 threads, force=True, on the connected client
self = Step3(data, 8, True, ipyclient)
self.run()
[####################] 100% 0:00:00 | indexing reference | s3 | [####################] 100% 0:00:00 | concatenating | s3 | [####################] 100% 0:05:42 | join unmerged pairs | s3 | [####################] 100% 0:02:17 | dereplicating | s3 | [####################] 100% 0:00:39 | splitting dereps | s3 | [####################] 100% 1:53:03 | mapping reads | s3 | [####################] 100% 1:40:12 | building clusters | s3 | [####################] 100% 0:00:24 | calc cluster stats | s3 |
self.data.run("45")
Assembly: 5-tortas [####################] 100% 0:15:41 | inferring [H, E] | s4 | [####################] 100% 0:00:26 | calculating depths | s5 | [####################] 100% 0:00:45 | chunking clusters | s5 | [####################] 100% 16:10:59 | consens calling | s5 | [####################] 100% 0:01:31 | indexing alleles | s5 |
# step3 internals run piecewise: index the reference, then concat edits,
# then join unmerged read pairs end-to-end
self.remote_index_refs()
self.remote_run(
function=concat_multiple_edits,
printstr=("concatenating ", "s3"),
args=(),
)
self.remote_run(
function=merge_end_to_end,
printstr=("join unmerged pairs ", "s3"),
args=(False, False,),
)
[####################] 100% 0:00:00 | indexing reference | s3 | [####################] 100% 0:00:00 | concatenating | s3 | [####################] 100% 0:05:49 | join unmerged pairs | s3 |
# dereplicate joined reads (vsearch); threaded=True so each engine runs multi-threaded
self.remote_run(
function=dereplicate,
printstr=("dereplicating ", "s3"),
args=(self.nthreads,),
threaded=True,
)
[####################] 100% 0:02:22 | dereplicating | s3 |
# split the end-to-end joined dereplicated reads back into their two halves
self.remote_run(
function=split_endtoend_reads,
printstr=("splitting dereps ", "s3"),
args=(),
)
[####################] 100% 0:00:36 | splitting dereps | s3 |
# map reads to the reference; threaded=True so each engine runs multi-threaded
self.remote_run(
function=mapping_reads,
printstr=("mapping reads ", "s3"),
args=(self.nthreads,),
threaded=True,
)
[########## ] 50% 1:18:01 | mapping reads | s3 |
# re-run end-to-end merging by hand for the first sample (debugging the hang above)
sample = list(self.data.samples.values())[0]
merge_end_to_end(self.data, sample, True, True)
# Pick the first sample and locate its most-processed existing input file.
# Candidates are ordered least- to most-processed; the last one found wins.
sample = list(data.samples.values())[0]
candidates = (
    (data.dirs.edits, "{}.trimmed_R1_.fastq.gz"),
    (data.dirs.edits, "{}_R1_concatedit.fq.gz"),
    (data.tmpdir, "{}_merged.fastq"),
    (data.tmpdir, "{}_declone.fastq"),
)
infiles = []
for dirname, template in candidates:
    path = os.path.join(dirname, template.format(sample.name))
    if os.path.exists(path):
        infiles.append(path)
infile = infiles[-1]
# REPL echo of the chosen input file
infile
'/home/deren/Documents/ipyrad/tests/tortas/5-tortas-tmpalign/AGO02concat_merged.fastq'
# Assemble the vsearch dereplication command (manual run of step3's dereplicate).
strand = "plus"
# gbs/2brad reads can match in either orientation, so derep both strands.
# BUG FIX: the original `is ('gbs' or '2brad')` evaluated to `is 'gbs'` --
# an identity test that never matches '2brad' and matches 'gbs' only by
# string-interning accident; membership test is the correct check.
if data.paramsdict["datatype"] in ("gbs", "2brad"):
    strand = "both"
nthreads = 2
cmd = [
    ip.bins.vsearch,
    "--derep_fulllength", infile,
    "--strand", strand,
    "--output", os.path.join(data.tmpdir, sample.name + "_derep.fastq"),
    "--threads", str(nthreads),
    "--fasta_width", str(0),
    "--fastq_qmax", "1000",
    "--sizeout",
    "--relabel_md5",
]
# REPL echo of the assembled command
cmd
['/home/deren/Documents/ipyrad/bin/vsearch-linux-x86_64', '--derep_fulllength', '/home/deren/Documents/ipyrad/tests/tortas/5-tortas-tmpalign/AGO02concat_merged.fastq', '--strand', 'plus', '--output', '/home/deren/Documents/ipyrad/tests/tortas/5-tortas-tmpalign/AGO02concat_derep.fastq', '--threads', '2', '--fasta_width', '0', '--fastq_qmax', '1000', '--sizeout', '--relabel_md5']
# run vsearch; stderr is folded into stdout so errmsg captures everything
proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE, close_fds=True)
errmsg = proc.communicate()[0]
# nonzero exit -> log the captured output and abort
if proc.returncode:
ip.logger.error("error inside dereplicate %s", errmsg)
raise IPyradWarningExit(errmsg)
# retry dereplication through the step object (interrupted by hand below)
s.remote_run(
function=dereplicate,
printstr=("dereplicating ", "s3"),
args=(s.nthreads,),
threaded=True,
)
[ ] 0% 0:01:55 | dereplicating | s3 |
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-10-9074a628ea2e> in <module>() 3 printstr=("dereplicating ", "s3"), 4 args=(s.nthreads,), ----> 5 threaded=True, 6 ) ~/Documents/ipyrad/ipyrad/assemble/clustmap.py in remote_run(self, printstr, function, args, threaded) 598 ready = [rasyncs[i].ready() for i in rasyncs] 599 self.data._progressbar(len(ready), sum(ready), start, printstr) --> 600 time.sleep(0.1) 601 if len(ready) == sum(ready): 602 break KeyboardInterrupt: