In [1]:
import ipyrad as ip
import ipyparallel as ipp
In [2]:
ipyclient = ipp.Client()
/home/deren/miniconda3/lib/python3.6/site-packages/ipyparallel/client/client.py:458: RuntimeWarning: 
            Controller appears to be listening on localhost, but not on this machine.
            If this is true, you should specify Client(...,sshserver='[email protected]')
            or instruct your controller to listen on an external IP.
  RuntimeWarning)
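This warning is benign when the notebook runs on the same host as the controller; otherwise the client can tunnel in over SSH, as the message suggests. A minimal sketch (the hostname below is a placeholder, not from this session):

# connect through an SSH tunnel when the controller listens only on localhost
# ("user@remote-host" is a placeholder)
ipyclient = ipp.Client(sshserver="user@remote-host")
len(ipyclient)  # number of engines that responded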
In [4]:
data = ip.load_json("6-tortas/test.json")
test2 = data.branch("test2")
#test2.run("5", force=True)
test2.remote_calculate_depths()
test2.remote_make_chunks()
loading Assembly: test
from saved path: ~/local/src/ipyrad/tests/6-tortas/test.json
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-4-a594ca197022> in <module>()
      2 test2 = data.branch("test2")
      3 #test2.run("5", force=True)
----> 4 test2.remote_calculate_depths()
      5 test2.remote_make_chunks()

AttributeError: 'Assembly' object has no attribute 'remote_calculate_depths'
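The error is expected: remote_calculate_depths and remote_make_chunks are methods of the Step5 helper class, not of Assembly, so they have to be called on a Step5 instance, as in the next cell.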
In [4]:
self = Step5(test2, True, ipyclient)
self.remote_calculate_depths()
self.remote_make_chunks()
loading Assembly: test
from saved path: ~/local/src/ipyrad/tests/6-tortas/test.json
[####################] 100% 0:00:14 | calculating depths   | s5 |
[####################] 100% 0:00:31 | chunking clusters    | s5 |
In [26]:
data = test2
sample = data.samples["PZ03concat"]
isref = True
for sample in [sample]:
    chunks = glob.glob(os.path.join(
        data.tmpdir,
        "{}.chunk.*".format(sample.name)))
    chunks.sort(key=lambda x: int(x.split('.')[-1]))
chunkfile = chunks[1]
chunkfile
Out[26]:
'/home/deren/local/src/ipyrad/tests/6-tortas/test2-tmpdir/PZ03concat.chunk.35195.35195'
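The two trailing fields of the chunk filename are the chunk size (optim) and its starting index (tmpnum), which Processor.set_params parses back out; equivalently:

# parse the two trailing fields of the chunk filename
optim, tmpnum = (int(i) for i in chunkfile.rsplit(".", 2)[1:])  # 35195, 35195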
In [27]:
p = Processor(data, sample, chunkfile, isref)
p.run()
In [30]:
p.counters, p.filters
Out[30]:
({'name': 45206, 'heteros': 1576, 'nsites': 4168046, 'nconsens': 10011},
 {'depth': 25179, 'maxh': 3, 'maxn': 2})
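Sanity check: nconsens (10011) plus the three filter counts (25179 + 3 + 2) equals the 35195 clusters in the chunk, and the name counter advanced from 35195 to 45206, i.e., by exactly nconsens.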
In [9]:
class Processor:
    def __init__(self, data, sample, chunkfile, isref):
        self.data = data
        self.sample = sample
        self.chunkfile = chunkfile
        self.isref = isref

        # prepare the processor
        self.set_params()
        self.init_counters()
        self.init_arrays()        
        self.chroms2ints()

        # run the processor
        #self.run()

    def run(self):
        self.process_chunk()
        self.write_chunk()

    def set_params(self):
        # parse chunk info from the filename and set max limits
        self.tmpnum = int(self.chunkfile.split(".")[-1])
        self.optim = int(self.chunkfile.split(".")[-2])
        self.este = self.data.stats.error_est.mean()
        self.esth = self.data.stats.hetero_est.mean()
        self.maxlen = self.data._hackersonly["max_fragment_length"]
        if 'pair' in self.data.paramsdict["datatype"]:
            self.maxhet = sum(self.data.paramsdict["max_Hs_consens"])
            self.maxn = sum(self.data.paramsdict["max_Ns_consens"])
        else:
            self.maxhet = self.data.paramsdict["max_Hs_consens"][0]
            self.maxn = self.data.paramsdict["max_Ns_consens"][0]
        # not enforced for ref
        if self.isref:
            self.maxn = int(1e6)
        
    def init_counters(self):
        # store data for stats counters.
        self.counters = {
            "name": self.tmpnum,
            "heteros": 0,
            "nsites": 0,
            "nconsens": 0,
        }

        # store data for what got filtered
        self.filters = {
            "depth": 0,
            "maxh": 0,
            "maxn": 0,
        }

        # store data for writing
        self.storeseq = {}

    def init_arrays(self):
        # local copies to use to fill the arrays
        self.catarr = np.zeros((self.optim, self.maxlen, 4), dtype=np.uint32)
        self.nallel = np.zeros((self.optim, ), dtype=np.uint8)
        self.refarr = np.zeros((self.optim, 3), dtype=np.int64)

    def chroms2ints(self):
        # if reference-mapped then parse the fai to get index number of chroms
        if self.isref:
            fai = pd.read_csv(
                self.data.paramsdict["reference_sequence"] + ".fai",
                names=['scaffold', 'size', 'sumsize', 'a', 'b'],
                sep="\t")
            self.faidict = {j: i for i, j in enumerate(fai.scaffold)}
            self.revdict = {j: i for i, j in self.faidict.items()}

    # ---------------------------------------------
    def process_chunk(self):
        # stream through the clusters
        with open(self.chunkfile, 'rb') as inclust:
            pairdealer = izip(*[iter(inclust)] * 2)
            done = 0
            while not done:
                done, chunk = clustdealer(pairdealer, 1)
                if chunk:  
                    self.parse_cluster(chunk)
                    if self.filter_mindepth():
                        self.build_consens_and_array()
                        self.get_heteros()
                        if self.filter_maxhetero():
                            if self.filter_maxN_minLen():
                                self.get_alleles()
                                self.store_data()


    def parse_cluster(self, chunk):
        # get names and seqs
        piece = chunk[0].decode().strip().split("\n")
        self.names = piece[0::2]
        self.seqs = piece[1::2]

        # pull replicate read counts from the name strings
        self.reps = [int(n.split(";")[-2][5:]) for n in self.names]

        # ref positions
        self.ref_position = (-1, 0, 0)
        if self.isref:
            # parse position from name string
            rname = self.names[0].rsplit(";", 2)[0]
            chrom, posish = rname.rsplit(":")
            pos0, pos1 = posish.split("-")
            # pull idx from .fai reference dict
            chromint = self.faidict[chrom] + 1
            self.ref_position = (int(chromint), int(pos0), int(pos1))

    def filter_mindepth(self):
        # exit if nreps is less than mindepth setting
        if not nfilter1(self.data, self.reps):
            self.filters['depth'] += 1
            return 0
        return 1

    def build_consens_and_array(self):
        # get stacks of base counts
        sseqs = [list(seq) for seq in self.seqs]
        arrayed = np.concatenate(
            [[seq] * rep for (seq, rep) in zip(sseqs, self.reps)]
        ).astype(bytes)

        # ! enforce maxlen limit !
        self.arrayed = arrayed[:, :self.maxlen]
                    
        # get unphased consens sequence from arrayed
        self.consens = base_caller(
            self.arrayed, 
            self.data.paramsdict["mindepth_majrule"], 
            self.data.paramsdict["mindepth_statistical"],
            self.esth, 
            self.este,
        )

        # trim Ns from the left and right ends
        self.consens[self.consens == b"-"] = b"N"
        trim = np.where(self.consens != b"N")[0]
        ltrim, rtrim = trim.min(), trim.max()
        self.consens = self.consens[ltrim:rtrim + 1]
        self.arrayed = self.arrayed[:, ltrim:rtrim + 1]

        # update position for trimming
        self.ref_position = (
            self.ref_position[0], 
            self.ref_position[1] + ltrim,
            self.ref_position[1] + ltrim + rtrim + 1,
            )

    def get_heteros(self):
        self.hidx = [
            i for (i, j) in enumerate(self.consens) if 
            j.decode() in list("RKSYWM")]
        self.nheteros = len(self.hidx)

    def filter_maxhetero(self):
        if not nfilter2(self.nheteros, self.maxhet):
            self.filters['maxh'] += 1
            return 0
        return 1

    def filter_maxN_minLen(self):
        if not nfilter3(self.consens, self.maxn):
            self.filters['maxn'] += 1
            return 0
        return 1

    def get_alleles(self):
        # if fewer than two Hs then there is only one allele
        if len(self.hidx) < 2:
            self.nalleles = 1
        else:
            # array of hetero sites
            harray = self.arrayed[:, self.hidx]
            # remove reads with - or N at variable site
            harray = harray[~np.any(harray == b"-", axis=1)]
            harray = harray[~np.any(harray == b"N", axis=1)]
            # get counts of each allele (e.g., AT:2, CG:2)
            ccx = Counter([tuple(i) for i in harray])
            # if more than two alleles are observed, drop the low-frequency
            # ones, since they likely reflect sequencing errors at hetero
            # sites that create a spurious third allele or combination.
            if len(ccx) > 2:
                totdepth = harray.shape[0]
                cutoff = max(1, totdepth // 10)
                alleles = [i for i in ccx if ccx[i] > cutoff]
            else:
                alleles = ccx.keys()
            self.nalleles = len(alleles)

            if self.nalleles == 2:
                try:
                    self.consens = storealleles(self.consens, self.hidx, alleles)
                except (IndexError, KeyError):
                    # the H sites do not form good alleles
                    ip.logger.info("failed at phasing loc, skipping")
                    ip.logger.info("""
                        consens %s
                        hidx %s
                        alleles %s
                        """, self.consens, self.hidx, alleles)
                    # todo return phase or no-phase and store for later
                    # ...

    def store_data(self):
        # current counter
        cidx = self.counters["nconsens"]
        self.nallel[cidx] = self.nalleles
        self.refarr[cidx] = self.ref_position
        # store a reduced array with only CATG
        catg = np.array(
            [np.sum(self.arrayed == i, axis=0) for i in [b'C', b'A', b'T', b'G']],
            dtype='uint16').T
        # do not allow ints larger than 65535 (uint16)
        self.catarr[cidx, :catg.shape[0], :] = catg

        # store the seqdata and advance counters
        self.storeseq[cidx] = b"".join(list(self.consens))
        self.counters["name"] += 1
        self.counters["nconsens"] += 1
        self.counters["heteros"] += self.nheteros

    # ----------------------------------------------
    def write_chunk(self):

        # find the first all-zero row, i.e., where the filled data ends
        end = np.where(np.all(self.refarr == 0, axis=1))[0]
        if np.any(end):
            end = end.min()
        else:
            end = self.refarr.shape[0]

        # write final consens string chunk
        consenshandle = os.path.join(
            self.data.tmpdir,
            "{}_tmpcons.{}.{}".format(self.sample.name, end, self.tmpnum))

        # write chunk. 
        if self.storeseq:
            with open(consenshandle, 'wt') as outfile:

                # denovo: just write the consens seqs as a simple fasta
                if not self.isref:
                    outfile.write(
                        "\n".join(
                            [">" + self.sample.name + "_" + str(key) + \
                            "\n" + self.storeseq[key].decode() 
                            for key in self.storeseq]))

                # reference data also need to record whether the read is revcomp to the reference
                else:
                    outfile.write(
                        "\n".join(
                            ["{}:{}:{}-{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}"
                            .format(
                                self.sample.name,
                                self.refarr[i][0],
                                self.refarr[i][1],
                                self.refarr[i][2],
                                0,
                                self.revdict[self.refarr[i][0] - 1],
                                self.refarr[i][1],
                                0,
                                #"{}M".format(refarr[i][2] - refarr[i][1]),
                                make_allele_cigar(
                                    "".join(['.' if ch.islower() else ch
                                             for ch in self.storeseq[i].decode()]),
                                    ".",
                                    "S",
                                ),
                                #"{}M".format(len(self.storeseq[i].decode())),
                                "*",
                                0,
                                self.refarr[i][2] - self.refarr[i][1],
                                self.storeseq[i].decode(),
                                "*",
                                # "XT:Z:{}".format(
                                    # make_indel_cigar(storeseq[i].decode())
                                    # ),

                            ) for i in self.storeseq.keys()]
                            ))

        # store reduced-size arrays whose row indexes match the consens tmpfile
        tmp5 = consenshandle.replace("_tmpcons.", "_tmpcats.")
        with h5py.File(tmp5, 'w') as io5:
            # reduce catarr to uint16 size for easier storage
            self.catarr[self.catarr > 65535] = 65535
            self.catarr = self.catarr.astype(np.uint16)
            io5.create_dataset(name="cats", data=self.catarr[:end])
            io5.create_dataset(name="alls", data=self.nallel[:end])
            io5.create_dataset(name="chroms", data=self.refarr[:end])
        del self.catarr
        del self.nallel
        del self.refarr

        # return stats
        self.counters['nsites'] = sum([len(i) for i in self.storeseq.values()])
        del self.storeseq
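The unexecuted cell below tries a bare ipyclient.apply; a fuller load-balanced dispatch might look like the following sketch. The helper function and result collection are illustrative, not ipyrad's own step-5 code, and they assume Processor and its dependencies are importable on the engines:

def process_remote(data, sample, chunkfile, isref):
    # runs on an engine: build a Processor, run it, return the stats dicts
    proc = Processor(data, sample, chunkfile, isref)
    proc.run()
    return proc.counters, proc.filters

# one load-balanced job per chunkfile; each result is a (counters, filters) tuple
lbview = ipyclient.load_balanced_view()
jobs = [lbview.apply(process_remote, data, sample, chk, isref) for chk in chunks]
results = [j.get() for j in jobs]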
In [ ]:
res = ipyclient.apply(process_chunks, *(data, sample, chunkfile, isref))
res.result()
In [5]:
statsdicts = Processor(data, sample, chunkfile, isref)
In [8]:
statsdicts.counters, statsdicts.filters
Out[8]:
({'name': 10148, 'heteros': 1736, 'nsites': 4258854, 'nconsens': 10148},
 {'depth': 25043, 'maxh': 4, 'maxn': 0})
In [11]:
aa = ipyclient[0].apply(Processor, )
Out[11]:
<__main__.D at 0x7f167c837b70>
In [141]:
# write sequences to SAM file
combs1 = glob.glob(os.path.join(
    data.tmpdir,
    "{}_tmpcons.*".format(sample.name)))
combs1.sort(key=lambda x: int(x.split(".")[-1]))

combs1
Out[141]:
[]
In [138]:
# write sequences to SAM file
combs1 = glob.glob(os.path.join(
    data.tmpdir,
    "{}_tmpcons.*".format(sample.name)))
combs1.sort(key=lambda x: int(x.split(".")[-1]))

# open sam handle for writing to bam
samfile = sample.files.consens.replace(".bam", ".sam")
with open(samfile, 'w') as outf:

    # parse fai file for writing headers
    fai = "{}.fai".format(data.paramsdict["reference_sequence"])
    fad = pd.read_csv(fai, sep="\t", names=["SN", "LN", "POS", "N1", "N2"])
    headers = ["@HD\tVN:1.0\tSO:coordinate"]
    headers += [
        "@SQ\tSN:{}\tLN:{}".format(i, j)
        for (i, j) in zip(fad["SN"], fad["LN"])
    ]
    outf.write("\n".join(headers) + "\n")

    # write to file with sample names imputed to line up with catg array
    counter = 0
    for fname in combs1:
        with open(fname) as infile:
            # impute catg ordered seqnames; do not shadow the Assembly `data`
            lines = infile.readlines()
            fdata = []
            for line in lines:
                name, chrom, rest = line.rsplit(":", 2)
                fdata.append(
                    "{}_{}:{}:{}".format(name, counter, chrom, rest)
                    )
                counter += 1
            outf.write("".join(fdata) + "\n")
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-138-6cbce1edccb2> in <module>()
----> 1 self.refarr

AttributeError: 'Processor' object has no attribute 'refarr'
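The refarr lookup fails because write_chunk deletes the big per-chunk arrays (del self.refarr, etc.) after dumping them to the HDF5 tmpfile, so they no longer exist on a Processor that has finished writing.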
In [14]:
class Processor:
    def __init__(self, data, sample, chunkfile, isref):
        self.data = data
        self.sample = sample
        self.chunkfile = chunkfile
        self.isref = isref

        # prepare the processor
        self.set_params()
        self.init_counters()
        self.init_arrays()        
        self.chroms2ints()

    # run the processor
    def run(self):
        self.process_chunk()
        self.write_chunk()

    def set_params(self):
        # parse chunk info from the filename and set max limits
        self.tmpnum = int(self.chunkfile.split(".")[-1])
        self.optim = int(self.chunkfile.split(".")[-2])
        self.este = self.data.stats.error_est.mean()
        self.esth = self.data.stats.hetero_est.mean()
        self.maxlen = self.data._hackersonly["max_fragment_length"]
        if 'pair' in self.data.paramsdict["datatype"]:
            self.maxhet = sum(self.data.paramsdict["max_Hs_consens"])
            self.maxn = sum(self.data.paramsdict["max_Ns_consens"])
        else:
            self.maxhet = self.data.paramsdict["max_Hs_consens"][0]
            self.maxn = self.data.paramsdict["max_Ns_consens"][0]
        # not enforced for ref
        if self.isref:
            self.maxn = int(1e6)
        
    def init_counters(self):
        # store data for stats counters.
        self.counters = {
            "name": self.tmpnum,
            "heteros": 0,
            "nsites": 0,
            "nconsens": 0,
        }

        # store data for what got filtered
        self.filters = {
            "depth": 0,
            "maxh": 0,
            "maxn": 0,
        }

        # store data for writing
        self.storeseq = {}

    def init_arrays(self):
        # local copies to use to fill the arrays
        self.catarr = np.zeros((self.optim, self.maxlen, 4), dtype=np.uint32)
        self.nallel = np.zeros((self.optim, ), dtype=np.uint8)
        self.refarr = np.zeros((self.optim, 3), dtype=np.int64)

    def chroms2ints(self):
        # if reference-mapped then parse the fai to get index number of chroms
        if self.isref:
            fai = pd.read_csv(
                self.data.paramsdict["reference_sequence"] + ".fai",
                names=['scaffold', 'size', 'sumsize', 'a', 'b'],
                sep="\t")
            self.faidict = {j: i for i, j in enumerate(fai.scaffold)}
            self.revdict = {j: i for i, j in self.faidict.items()}

    # ---------------------------------------------
    def process_chunk(self):
        # stream through the clusters
        with open(self.chunkfile, 'rb') as inclust:
            pairdealer = izip(*[iter(inclust)] * 2)
            done = 0
            while not done:
                done, chunk = clustdealer(pairdealer, 1)
                if chunk:  
                    self.parse_cluster(chunk)
                    if self.filter_mindepth():
                        self.build_consens_and_array()
                        self.get_heteros()
                        if self.filter_maxhetero():
                            if self.filter_maxN_minLen():
                                self.get_alleles()
                                self.store_data()

    def parse_cluster(self, chunk):
        # get names and seqs
        piece = chunk[0].decode().strip().split("\n")
        self.names = piece[0::2]
        self.seqs = piece[1::2]

        # pull replicate read counts from the name strings
        self.reps = [int(n.split(";")[-2][5:]) for n in self.names]

        # ref positions
        self.ref_position = (-1, 0, 0)
        if self.isref:
            # parse position from name string
            rname = self.names[0].rsplit(";", 2)[0]
            chrom, posish = rname.rsplit(":")
            pos0, pos1 = posish.split("-")
            # pull idx from .fai reference dict
            chromint = self.faidict[chrom] + 1
            self.ref_position = (int(chromint), int(pos0), int(pos1))

    def filter_mindepth(self):
        # exit if nreps is less than mindepth setting
        if not nfilter1(self.data, self.reps):
            self.filters['depth'] += 1
            return 0
        return 1

    def build_consens_and_array(self):
        # get stacks of base counts
        sseqs = [list(seq) for seq in self.seqs]
        arrayed = np.concatenate(
            [[seq] * rep for (seq, rep) in zip(sseqs, self.reps)]
        ).astype(bytes)

        # ! enforce maxlen limit !
        self.arrayed = arrayed[:, :self.maxlen]
                    
        # get unphased consens sequence from arrayed
        self.consens = base_caller(
            self.arrayed, 
            self.data.paramsdict["mindepth_majrule"], 
            self.data.paramsdict["mindepth_statistical"],
            self.esth, 
            self.este,
        )

        # trim Ns from the left and right ends
        self.consens[self.consens == b"-"] = b"N"
        trim = np.where(self.consens != b"N")[0]
        ltrim, rtrim = trim.min(), trim.max()
        self.consens = self.consens[ltrim:rtrim + 1]
        self.arrayed = self.arrayed[:, ltrim:rtrim + 1]

        # update position for trimming
        self.ref_position = (
            self.ref_position[0], 
            self.ref_position[1] + ltrim,
            self.ref_position[1] + ltrim + rtrim + 1,
            )

    def get_heteros(self):
        self.hidx = [
            i for (i, j) in enumerate(self.consens) if 
            j.decode() in list("RKSYWM")]
        self.nheteros = len(self.hidx)

    def filter_maxhetero(self):
        if not nfilter2(self.nheteros, self.maxhet):
            self.filters['maxh'] += 1
            return 0
        return 1

    def filter_maxN_minLen(self):
        if not nfilter3(self.consens, self.maxn):
            self.filters['maxn'] += 1
            return 0
        return 1

    def get_alleles(self):
        # if fewer than two Hs then there is only one allele
        if len(self.hidx) < 2:
            self.nalleles = 1
        else:
            # array of hetero sites
            harray = self.arrayed[:, self.hidx]
            # remove reads with - or N at variable site
            harray = harray[~np.any(harray == b"-", axis=1)]
            harray = harray[~np.any(harray == b"N", axis=1)]
            # get counts of each allele (e.g., AT:2, CG:2)
            ccx = Counter([tuple(i) for i in harray])
            # if more than two alleles are observed, drop the low-frequency
            # ones, since they likely reflect sequencing errors at hetero
            # sites that create a spurious third allele or combination.
            if len(ccx) > 2:
                totdepth = harray.shape[0]
                cutoff = max(1, totdepth // 10)
                alleles = [i for i in ccx if ccx[i] > cutoff]
            else:
                alleles = ccx.keys()
            self.nalleles = len(alleles)

            if self.nalleles == 2:
                try:
                    self.consens = storealleles(self.consens, self.hidx, alleles)
                except (IndexError, KeyError):
                    # the H sites do not form good alleles
                    ip.logger.info("failed at phasing loc, skipping")
                    ip.logger.info("""
                        consens %s
                        hidx %s
                        alleles %s
                        """, self.consens, self.hidx, alleles)

    def store_data(self):
        # current counter
        cidx = self.counters["nconsens"]
        self.nallel[cidx] = self.nalleles
        self.refarr[cidx] = self.ref_position
        # store a reduced array with only CATG
        catg = np.array(
            [np.sum(self.arrayed == i, axis=0)
                for i in [b'C', b'A', b'T', b'G']],
            dtype='uint32').T
        self.catarr[cidx, :catg.shape[0], :] = catg

        # store the seqdata and advance counters
        self.storeseq[cidx] = b"".join(list(self.consens))
        self.counters["name"] += 1
        self.counters["nconsens"] += 1
        self.counters["heteros"] += self.nheteros

    # ----------------------------------------------
    def write_chunk(self):

        # find the first all-zero row, i.e., where the filled data ends
        end = np.where(np.all(self.refarr == 0, axis=1))[0]
        if np.any(end):
            end = end.min()
        else:
            end = self.refarr.shape[0]

        # write final consens string chunk
        consenshandle = os.path.join(
            self.data.tmpdir,
            "{}_tmpcons.{}.{}".format(self.sample.name, end, self.tmpnum))

        # write chunk. 
        if self.storeseq:
            with open(consenshandle, 'wt') as outfile:

                # denovo: just write the consens seqs as a simple fasta
                if not self.isref:
                    outfile.write(
                        "\n".join(
                            [">" + self.sample.name + "_" + str(key) + \
                            "\n" + self.storeseq[key].decode() 
                            for key in self.storeseq]))

                # reference data also need to record whether the read is revcomp to the reference
                else:
                    outfile.write(
                        "\n".join(
                            ["{}:{}:{}-{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}"
                            .format(
                                self.sample.name,
                                self.refarr[i][0],
                                self.refarr[i][1],
                                self.refarr[i][2],
                                0,
                                self.revdict[self.refarr[i][0] - 1],
                                self.refarr[i][1],
                                0,

                                # why doesn't this line up with +2 ?
                                # make_indel_cigar(storeseq[i].decode()),
                                #"{}M".format(refarr[i][2] - refarr[i][1]),
                                "{}M".format(len(self.storeseq[i].decode())),
                                "*",
                                0,

                                ## + 2 here
                                self.refarr[i][2] - self.refarr[i][1],
                                #len(self.storeseq[i].decode()),
                                self.storeseq[i].decode(),
                                "*",
                                # "XT:Z:{}".format(
                                    # make_indel_cigar(storeseq[i].decode())
                                    # ),

                            ) for i in self.storeseq.keys()]
                            ))

        # store reduced-size arrays whose row indexes match the consens tmpfile
        tmp5 = consenshandle.replace("_tmpcons.", "_tmpcats.")
        with h5py.File(tmp5, 'w') as io5:
            io5.create_dataset(name="cats", data=self.catarr[:end])
            io5.create_dataset(name="alls", data=self.nallel[:end])
            io5.create_dataset(name="chroms", data=self.refarr[:end])
        del self.catarr
        del self.nallel
        del self.refarr

        # return stats
        self.counters['nsites'] = sum([len(i) for i in self.storeseq.values()])
        del self.storeseq
        #return self.counters, self.filters
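Relative to the first version above, this revision writes a plain {}M CIGAR over the consens length instead of calling make_allele_cigar, and stores the catg counts as uint32 without the 65535 clipping step.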
In [87]:
with open(self.chunkfile, 'rb') as inclust:
    pairdealer = izip(*[iter(inclust)] * 2)
    for i in range(20):
        _, chunk = clustdealer(pairdealer, 1)
        if chunk:
            self.parse_cluster(chunk)
            if self.filter_mindepth():
                print(self.reps)
                self.build_consens_and_array()
                
                # get stacks of base counts
                sseqs = [list(seq) for seq in self.seqs]
                arrayed = np.concatenate(
                    [[seq] * rep for (seq, rep) in zip(sseqs, self.reps)]
                ).astype(bytes)
                self.arrayed = arrayed[:, :self.maxlen]
              
                # get consens call for each site, applies paralog-x-site filter
                self.consens = base_caller(
                    arrayed, 
                    self.data.paramsdict["mindepth_majrule"], 
                    self.data.paramsdict["mindepth_statistical"],
                    self.esth, 
                    self.este,
                )
                self.consens[self.consens == b"-"] = b"N"
                trim = np.where(self.consens != b"N")[0]
                ltrim, rtrim = trim.min(), trim.max()
                self.consens = self.consens[ltrim:rtrim + 1]
                self.arrayed = self.arrayed[:, ltrim:rtrim + 1]
                print(self.ref_position)
      
                # update position for trimming
                self.ref_position = (
                    self.ref_position[0], 
                    self.ref_position[1] + ltrim,
                    self.ref_position[1] + ltrim + rtrim + 1,
                    )
                
                print(self.ref_position)
[5, 1]
(1, 3481, 1)
(1, 3481, 3748)
[6]
(1, 30443, 1)
(1, 30443, 30713)
[9, 1, 1]
(1, 33785, 1)
(1, 33785, 34014)
In [39]:
# min index where it is not N
np.where(arr != "N")[0].min()
np.where(arr != "N")
Out[39]:
(array([ 7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17]),)
In [40]:
consens = arr
trim = np.where(consens != "N")[0]
ltrim, rtrim = trim.min(), trim.max()
consens = consens[ltrim:rtrim + 1]
In [71]:
print(consens)
print(pos[0] + ltrim, rtrim + 1)
['A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'A' 'a']
7 18
In [54]:
arr[7:17]
Out[54]:
array(['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'], dtype='<U1')
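Note that arr[7:17] drops index 17, the last non-N site; the slice has to run to rtrim + 1 (here arr[7:18]), which is exactly why the trimming code uses trim.max() + 1 as the right bound.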
In [4]:
test2.run("5", force=True)
Assembly: test2
[####################] 100% 0:00:13 | calculating depths   | s5 |
[####################] 100% 0:00:25 | chunking clusters    | s5 |
[####################] 100% 0:04:25 | consens calling      | s5 |
[####################] 100% 0:01:13 | indexing alleles     | s5 |
In [50]:
from ipyrad.assemble.consens_se import *
self = Step5(test2, True, ipyclient)
self.remote_calculate_depths()
self.remote_make_chunks()
[####################] 100% 0:00:17 | calculating depths   | s5 |
[####################] 100% 0:00:29 | chunking clusters    | s5 |
In [54]:
self.data.tmpdir
test2.tmpdir
Out[54]:
'/home/deren/local/src/ipyrad/tests/6-tortas/test2-tmpdir'
In [58]:
data = test2
sample = data.samples["PZ03concat"]
isref = True

for sample in [sample]:
    chunks = glob.glob(os.path.join(
        self.data.tmpdir,
        "{}.chunk.*".format(sample.name)))
    chunks.sort(key=lambda x: int(x.split('.')[-1]))
chunkfile = chunks[0]
chunkfile
Out[58]:
'/home/deren/local/src/ipyrad/tests/6-tortas/test2-tmpdir/PZ03concat.chunk.35195.0'
In [125]:
# temporarily store the mean estimates to Assembly
este = data.stats.error_est.mean()
esth = data.stats.hetero_est.mean()

# get index number from tmp file name
tmpnum = int(chunkfile.split(".")[-1])
optim = int(chunkfile.split(".")[-2])

# prepare data for reading and use global maxfraglen
clusters = open(chunkfile, 'rb')
pairdealer = izip(*[iter(clusters)] * 2)
maxlen = data._hackersonly["max_fragment_length"]

# local copies to use to fill the arrays
catarr = np.zeros((optim, maxlen, 4), dtype=np.uint32)
nallel = np.zeros((optim, ), dtype=np.uint8)
refarr = np.zeros((optim, 3), dtype=np.int64)

# if reference-mapped then parse the fai to get index number of chroms
if isref:
    fai = pd.read_csv(
        data.paramsdict["reference_sequence"] + ".fai",
        names=['scaffold', 'size', 'sumsize', 'a', 'b'],
        sep="\t")
    faidict = {j: i for i, j in enumerate(fai.scaffold)}
    revdict = {j: i for i, j in faidict.items()}

# store data for stats counters.
counters = {
    "name": tmpnum,
    "heteros": 0,
    "nsites": 0,
    "nconsens": 0,
}

# store data for what got filtered
filters = {
    "depth": 0,
    "maxh": 0,
    "maxn": 0,
}

# store data for writing
storeseq = {}

## set max limits
if 'pair' in data.paramsdict["datatype"]:
    maxhet = sum(data.paramsdict["max_Hs_consens"])
    maxn = sum(data.paramsdict["max_Ns_consens"])
else:
    maxhet = data.paramsdict["max_Hs_consens"][0]
    maxn = data.paramsdict["max_Ns_consens"][0]
In [126]:
# stream clusters, stopping after 20 consens calls (debugging run)
done = 0
while counters["name"] < 20:
    done, chunk = clustdealer(pairdealer, 1)
    if chunk:            
        # get names and seqs
        piece = chunk[0].decode().strip().split("\n")
        names = piece[0::2]
        seqs = piece[1::2]

        # pull replicate read counts from the name strings
        reps = [int(sname.split(";")[-2][5:]) for sname in names]
        ref_position = (-1, 0, 0)
        if isref:
            # parse position from name string
            rname = names[0].rsplit(";", 2)[0]
            chrom, posish = rname.rsplit(":")
            pos0, pos1 = posish.split("-")

            # pull idx from .fai reference dict
            chromint = faidict[chrom] + 1
            ref_position = (int(chromint), int(pos0), int(pos1))

        # apply filters and fill arrays
        if nfilter1(data, reps):

            # get stacks of base counts
            sseqs = [list(seq) for seq in seqs]
            arrayed = np.concatenate(
                [[seq] * rep for seq, rep in zip(sseqs, reps)]
            ).astype(bytes)
            arrayed = arrayed[:, :maxlen]

            # get consens call for each site, applies paralog-x-site filter
            consens = base_caller(
                arrayed, 
                data.paramsdict["mindepth_majrule"], 
                data.paramsdict["mindepth_statistical"],
                esth, 
                este,
            )

            print(rname)
            print(len(consens), ref_position[2] - ref_position[1])
            
            # apply a filter to remove low coverage sites/Ns that
            # are likely sequence repeat errors. This is only applied to
            # clusters that already passed the read-depth filter (1)
            # and is not applied to ref aligned data which is expected to
            # have internal spacers that we leave in place here. 
            if b"N" in consens:
                if not isref:
                    try:
                        consens, arrayed = removerepeats(consens, arrayed)

                    except ValueError:
                        ip.logger.info(
                            "Caught bad chunk w/ all Ns. Skip it.")
                        continue
                else:
                    pass #print("here")

            # get hetero sites
            hidx = [
                i for (i, j) in enumerate(consens) if j.decode() 
                in list("RKSYWM")]
            nheteros = len(hidx)

            # filter for max number of hetero sites
            if nfilter2(nheteros, maxhet):
                # filter for maxN, & minlen
                if nfilter3(consens, maxn):
                    # counter right now
                    current = counters["nconsens"]
                    # get N alleles and get lower case in consens
                    consens, nhaps = nfilter4(consens, hidx, arrayed)
                    # store the number of alleles observed
                    nallel[current] = nhaps
                    # store a reduced array with only CATG
                    catg = np.array(
                        [np.sum(arrayed == i, axis=0)
                            for i in [b'C', b'A', b'T', b'G']],
                        dtype='uint32').T
                    catarr[current, :catg.shape[0], :] = catg
                    refarr[current] = ref_position

                    # store the seqdata for tmpchunk
                    storeseq[current] = b"".join(list(consens))
                    counters["name"] += 1
                    counters["nconsens"] += 1
                    counters["heteros"] += nheteros
                else:
                    filters['maxn'] += 1
            else:
                filters['maxh'] += 1
        else:
            filters['depth'] += 1

# close infile io
clusters.close()
Contig0:3481-3748
267 267
Contig0:30443-30713
270 270
Contig0:33785-34014
229 229
Contig0:37976-38629
653 653
Contig0:40388-40731
343 343
Contig0:41920-42163
243 243
Contig0:44568-44784
216 216
Contig0:55978-56333
355 355
Contig0:62256-62713
457 457
Contig0:64734-65000
266 266
Contig0:67768-68840
960 1072
Contig0:69863-70402
539 539
Contig0:76480-76711
231 231
Contig0:81339-81852
513 513
Contig0:82469-82923
454 454
Contig0:83146-83367
221 221
Contig0:95645-95887
242 242
Contig0:101553-101788
235 235
Contig0:103469-103709
240 240
Contig0:104258-104956
698 698
In [127]:
67768-68840
Out[127]:
-1072
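So -1072 is just the negative of the reference span for Contig0:67768-68840; the consens built there was only 960 bases against a 1072-base span, the one locus above where the two printed lengths disagree.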
In [112]:
ttt = list("NNNNNNNNNNNAAAAAAAAAAAAAAAAAAANNNNNNNNNNNNNN")
ttt = np.array(ttt, dtype=bytes)
ttt
Out[112]:
array([b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
       b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A',
       b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'A', b'N', b'N', b'N',
       b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N'],
      dtype='|S1')
In [102]:
ttt.nbytes
Out[102]:
44
In [107]:
ttt = np.array(list(b"NNNNNNNNNNNAAAAAAAAAAAAAAAAAAANNNNNNNNNNNNNN"))
ttt.nbytes
Out[107]:
352
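The difference is the dtype: with dtype=bytes the array is '|S1' at one byte per base (44 bytes), while list() on a bytes object yields Python ints that numpy stores as int64, eight bytes each (44 × 8 = 352).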
In [83]:
np.argmin(consens == b"N"), np.argmax(consens == b"N")
Out[83]:
(0, 69)
In [17]:
# find the first all-zero row, i.e., where the filled data ends
end = np.where(np.all(refarr == 0, axis=1))[0]
if np.any(end):
    end = end.min()
else:
    end = refarr.shape[0]

# write final consens string chunk
consenshandle = os.path.join(
    data.tmpdir,
    "{}_tmpcons.{}.{}".format(sample.name, end, tmpnum))

# write chunk. 
if storeseq:
    with open(consenshandle, 'wt') as outfile:

        # denovo: just write the consens seqs as a simple fasta
        if not isref:
            outfile.write(
                "\n".join([">" + sample.name + "_" + str(key) + \
                "\n" + storeseq[key].decode() for key in storeseq]))

        # reference data also need to record whether the read is revcomp to the reference
        else:
            outfile.write(
                "\n".join(
                    ["{}:{}:{}-{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}"
                    .format(
                        sample.name,
                        refarr[i][0],
                        refarr[i][1],
                        refarr[i][2],
                        0,
                        revdict[refarr[i][0] - 1],
                        refarr[i][1],
                        0,

                        # why doesn't this line up with +2 ?
                        # make_indel_cigar(storeseq[i].decode()),
                        #"{}M".format(refarr[i][2] - refarr[i][1]),
                        "{}M".format(len(storeseq[i].decode())),
                        "*",
                        0,

                        ## + 2 here
                        refarr[i][2] - refarr[i][1],

                        storeseq[i].decode(),
                        "*",
                        # "XT:Z:{}".format(
                            # make_indel_cigar(storeseq[i].decode())
                            # ),

                    ) for i in storeseq.keys()]
                    ))

# store reduced-size arrays whose row indexes match the consens tmpfile
tmp5 = consenshandle.replace("_tmpcons.", "_tmpcats.")
with h5py.File(tmp5, 'w') as io5:
    io5.create_dataset(name="cats", data=catarr[:end])
    io5.create_dataset(name="alls", data=nallel[:end])
    io5.create_dataset(name="chroms", data=refarr[:end])
del catarr
del nallel
del refarr

# return stats
counters['nsites'] = sum([len(i) for i in storeseq.values()])
del storeseq
# return counters, filters  (a return statement is invalid at the top level of a cell)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-17-c185a885e7fd> in <module>()
    110 # write final consens string chunk
    111 consenshandle = os.path.join(
--> 112     data.tmpdir,
    113     "{}_tmpcons.{}.{}".format(sample.name, end, tmpnum))
    114 

AttributeError: 'Assembly' object has no attribute 'tmpdir'
In [98]:
data = ip.load_json("6-tortas/tortas.json")
loading Assembly: tortas
from saved path: ~/local/src/ipyrad/tests/6-tortas/tortas.json
In [12]:
data = ip.Assembly("tortas")
data.set_params("project_dir", "6-tortas")
data.set_params("sorted_fastq_path", "/home/deren/Documents/Tortoise/fastq-concats-GG-d10-118/*.gz")
data.set_params("restriction_overhang", ("CATG", "AATT"))
data.set_params("datatype", "pairddrad")
data.set_params("assembly_method", "reference")
data.set_params("reference_sequence", "/home/deren/Documents/Tortoise/lgeorge.genome.fa")
data.get_params()
New Assembly: tortas
0   assembly_name               tortas                                       
1   project_dir                 ./6-tortas                                   
2   raw_fastq_path                                                           
3   barcodes_path                                                            
4   sorted_fastq_path           /home/deren/Documents/Tortoise/fastq-concats-GG-d10-118/*.gz
5   assembly_method             reference                                    
6   reference_sequence          /home/deren/Documents/Tortoise/lgeorge.genome.fa
7   datatype                    pairddrad                                    
8   restriction_overhang        ('CATG', 'AATT')                             
9   max_low_qual_bases          5                                            
10  phred_Qscore_offset         33                                           
11  mindepth_statistical        6                                            
12  mindepth_majrule            6                                            
13  maxdepth                    10000                                        
14  clust_threshold             0.85                                         
15  max_barcode_mismatch        0                                            
16  filter_adapters             0                                            
17  filter_min_trim_len         35                                           
18  max_alleles_consens         2                                            
19  max_Ns_consens              (5, 5)                                       
20  max_Hs_consens              (8, 8)                                       
21  min_samples_locus           4                                            
22  max_SNPs_locus              (20, 20)                                     
23  max_Indels_locus            (8, 8)                                       
24  max_shared_Hs_locus         0.5                                          
25  trim_reads                  (0, 0, 0, 0)                                 
26  trim_loci                   (0, 0, 0, 0)                                 
27  output_formats              ['p', 's', 'v']                              
28  pop_assign_file                                                          
In [5]:
data.ipcluster['threads'] = 4
In [6]:
data.run("2")
Assembly: tortas
[####################] 100% 1:22:46 | processing reads     | s2 |
In [7]:
data.stats.head()
Out[7]:
             state  reads_raw  reads_passed_filter
AGO02concat      2   11050294             11010349
AGO08concat      2   13408401             13343190
AGO09concat      2   15650127             15593137
AGO11concat      2   12848936             12766537
AGO12concat      2   12590531             12563936
In [12]:
from ipyrad.assemble.clustmap import *
self = Step3(data, 8, True, ipyclient)
In [14]:
self.run()
[####################] 100% 0:00:00 | indexing reference   | s3 |
[####################] 100% 0:00:05 | concatenating        | s3 |
[####################] 100% 1:31:45 | join unmerged pairs  | s3 |
[####################] 100% 1:34:21 | dereplicating        | s3 |
[####################] 100% 0:45:59 | splitting dereps     | s3 |
[####################] 100% 6:06:27 | mapping reads        | s3 |
[####################] 100% 4:20:31 | building clusters    | s3 |
[####################] 100% 0:01:17 | calc cluster stats   | s3 |
In [38]:
test = self.data.branch('test', [i.name for i in self.samples[:4]])
In [40]:
test.run('45')
Assembly: test
[####################] 100% 0:10:21 | inferring [H, E]     | s4 |
[####################] 100% 0:00:16 | calculating depths   | s5 |
[####################] 100% 0:00:29 | chunking clusters    | s5 |
[####################] 100% 0:14:46 | consens calling      | s5 |
[####################] 100% 0:01:30 | indexing alleles     | s5 |
In [3]:
test = ip.load_json("6-tortas/test.json")
loading Assembly: test
from saved path: ~/local/src/ipyrad/tests/6-tortas/test.json
In [29]:
test.run("5", force=True)
Assembly: test
[####################] 100% 0:00:15 | calculating depths   | s5 |
[####################] 100% 0:00:27 | chunking clusters    | s5 |
[####################] 100% 0:14:45 | consens calling      | s5 |
[####################] 100% 0:01:30 | indexing alleles     | s5 |
In [4]:
data = test
samples = list(data.samples.values())
In [5]:
from ipyrad.assemble.cluster_across import *
self = Step6(data, True, ipyclient)
In [6]:
self.remote_concat_bams()
[####################] 100% 0:00:54 | concatenating bams   | s6 |
In [40]:
self.remote_build_ref_regions()
[####################] 100% 0:00:13 | building clusters    | s6 |
In [41]:
# prepare i/o
bamfile = AlignmentFile(
    os.path.join(
        self.data.dirs.across,
        "{}.cat.sorted.bam".format(self.data.name)),
    'rb')
outfile = gzip.open(
    os.path.join(
        self.data.dirs.across,
        "{}.catcons.gz".format(self.data.name)),
    'wb')

# write header with sample names
outfile.write(
    b" ".join([i.name.encode() for i in self.samples]) + b"\n")      
Out[41]:
49
In [35]:
def remote_build_ref_clusters(self):
    "build clusters and find variants/indels to store"

    # prepare i/o
    bamfile = AlignmentFile(
        os.path.join(
            self.data.dirs.across,
            "{}.cat.sorted.bam".format(self.data.name)),
        'rb')
    outfile = gzip.open(
        os.path.join(
            self.data.dirs.across,
            "{}.catcons.gz".format(self.data.name)),
        'wb')

    # write header with sample names
    snames = sorted([i.name for i in self.samples])
    nsamples = len(snames)
    outfile.write(
        b" ".join([i.encode() for i in snames]) + b"\n")

    # get clusters
    iregions = iter(self.regions)
    clusts = []
    while 1:
        # pull in the cluster
        try:
            region = next(iregions)
            reads = bamfile.fetch(*region)
        except StopIteration:
            break

        # pull in the reference for this region
        refn, refs = get_ref_region(
            data.paramsdict["reference_sequence"], 
            region[0], region[1] + 1, region[2] + 1,
            )

        # build cluster dict for sorting                
        rdict = {}
        for read in reads:
            rdict[read.qname] = read.seq   
        keys = sorted(rdict.keys(), key=lambda x: x.rsplit(":", 2)[0])

        # make empty array
        arr = np.zeros((nsamples + 1, len(refs)), dtype=bytes)

        # fill it
        arr[0] = list(refs)
        for idx, key in enumerate(keys):
            # get start and stop from name
            sname = key.rsplit("_", 1)[0]
            rstart, rstop = key.rsplit(":", 2)[-1].split("-")
            sidx = snames.index(sname)

            # get start relative to ref
            start = int(rstart) - int(region[1]) - 1
            stop = start + int(rstop) - int(rstart)
            print(sidx + 1, start, stop, arr.shape[1])
            try:
                arr[sidx + 1, int(start): int(stop)] = list(rdict[key])
            except ValueError:
                print(rdict[key])

        # get consens seq and variant site index 
        print("")
        clust = []
        avars = refvars(arr.view(np.uint8), PSEUDO_REF)
        dat = b"".join(avars.view("S1")[:, 0]).decode()
        snpstring = "".join(["*" if i else " " for i in avars[:, 1]])
        clust.append("ref_{}:{}-{}\n{}".format(*region, dat))

        # or, just build variant string (or don't...)
        # write all loci with [locids, nsnps, npis, nindels, ?]
        # for key in keys:
        #     clust.append("{}\n{}".format(key, rdict[key]))
        # clust.append("SNPs\n" + snpstring)
        # clusts.append("\n".join(clust))

    outfile.write(str.encode("\n//\n//\n".join(clusts) + "\n//\n//\n"))
    outfile.close()
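The start/stop arithmetic above converts the 1-based reference coordinates embedded in the read names to 0-based offsets within the fetched region. A minimal check with illustrative coordinates (not from this session):

region = ("Contig0", 3480, 3748)           # chrom, 0-based start, end
rstart, rstop = "3481-3748".split("-")     # span parsed from a read name
start = int(rstart) - int(region[1]) - 1   # -> 0
stop = start + int(rstop) - int(rstart)    # -> 267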
In [63]:
iregions = iter(self.regions[:50])

def build_ref_clusters2(self):
    "build clusters and find variants/indels to store"
    # prepare i/o
    bamfile = AlignmentFile(
        os.path.join(
            self.data.dirs.across,
            "{}.cat.sorted.bam".format(self.data.name)),
        'rb')
    outfile = gzip.open(
        os.path.join(
            self.data.dirs.across,
            "{}.catcons.gz".format(self.data.name)),
        'wb')

    # write header with sample names
    snames = sorted([i.name for i in self.samples])
    nsamples = len(snames)
    outfile.write(
        b" ".join([i.encode() for i in snames]) + b"\n")

    # get clusters
    clusts = []
    while 1:
        # pull in the cluster
        try:
            region = next(iregions)
            reads = bamfile.fetch(*region)
        except StopIteration:
            break

        # pull in the reference for this region
        refn, refs = get_ref_region(
            data.paramsdict["reference_sequence"], 
            region[0], region[1] + 1, region[2] + 1,
            )

        # build cluster dict for sorting                
        rdict = {}
        for read in reads:
            rdict[read.qname] = read.seq   
        keys = sorted(rdict.keys(), key=lambda x: x.rsplit(":", 2)[0])

        # make empty array
        arr = np.zeros((len(rdict), len(refs)), dtype=bytes)
       
        # fill it
        arr[0] = list(refs)
        for idx, key in enumerate(keys):
            # get start and stop from name
            sname = key.rsplit("_", 1)[0]
            rstart, rstop = key.rsplit(":", 2)[-1].split("-")

            # get start relative to ref
            start = int(rstart) - int(region[1]) - 1
            stop = start + int(rstop) - int(rstart)
            try:
                sequence = list(rdict[key])
                arr[idx, int(start): int(stop)] = sequence
            except ValueError:
                print(key)
                print(sname, len(sequence), start, stop, arr.shape[1])
                print(rdict[key])

    return arr

build_ref_clusters2(self)
PZ03concat_10:1:67768-68840
PZ03concat 960 0 1072 1301
CATGAGATGCTAACGAGGTCTCTCAAAGCTCCAGGCCCAGCTGACTGACAGCTTTGCATACTGTTGTTCTTNNNNNNNNNNNNNNNNNNNNTTAGCACCTTTCGCTCAAGGGCTTCTAATGCTCCCCTCNCCTCCCCTCAATAATGATACTTAGCACTTGATGAATTNNNNNNNNNNNNNNNNNNNNAATTAAATAAGCCTTACCTGCTGTGAAATGTCAATACTATTTTGAAGATGGGGAAACCGAGGCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGTTCTCAGGGGTCCAGCAACCTTCAAGCCCTTGTTCTCAAGATCCCCTTTGGAATAAGACCTCGTCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGAGTCCAGTCCCCTGCCCTCATGGTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGATGCAACTTCTGTGTGTAGAGTAGGCTTCCGTGTTTTCAGATGCACATATTTGGGTCTCTGACCTGGTCCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
Out[63]:
array([[b'C', b'A', b'T', b'G', b'C', b'T', b'T', b'C', b'A', b'C', b'C',
        b'A', b'G', b'C', b'C', b'C', b'A', b'T', b'A', b'G', b'C', b'G',
        b'G', b'T', b'G', b'G', b'G', b'C', b'T', b'C', b'T', b'G', b'T',
        b'G', b'C', b'C', b'C', b'C', b'C', b'C', b'A', b'C', b'T', b'C',
        b'T', b'G', b'C', b'C', b'C', b'A', b'C', b'A', b'G', b'G', b'T',
        b'A', b'G', b'G', b'C', b'T', b'G', b'G', b'G', b'C', b'T', b'G',
        b'G', b'C', b'T', b'G', b'T', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'C', b'C', b'C', b'G', b'G', b'C', b'A', b'G', b'G',
        b'C', b'T', b'T', b'G', b'G', b'A', b'C', b'A', b'A', b'A', b'G',
        b'A', b'C', b'G', b'T', b'T', b'A', b'A', b'G', b'T', b'T', b'T',
        b'T', b'C', b'A', b'A', b'G', b'C', b'C', b'C', b'C', b'A', b'A',
        b'A', b'T', b'G', b'T', b'T', b'T', b'T', b'T', b'G', b'G', b'A',
        b'G', b'G', b'A', b'G', b'A', b'T', b'G', b'G', b'T', b'T', b'C',
        b'C', b'C', b'C', b'C', b'C', b'T', b'T', b'A', b'A', b'A', b'T',
        b'T'],
       [b'C', b'A', b'T', b'G', b'C', b'T', b'T', b'C', b'A', b'C', b'C',
        b'A', b'G', b'C', b'C', b'C', b'A', b'T', b'A', b'G', b'C', b'G',
        b'G', b'T', b'G', b'G', b'G', b'C', b'T', b'C', b'T', b'G', b'T',
        b'G', b'C', b'C', b'C', b'C', b'C', b'C', b'A', b'C', b'T', b'C',
        b'T', b'G', b'C', b'C', b'C', b'A', b'C', b'A', b'G', b'G', b'T',
        b'A', b'G', b'G', b'C', b'T', b'G', b'G', b'G', b'C', b'T', b'G',
        b'G', b'C', b'T', b'G', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'C', b'C', b'C', b'G', b'G', b'C', b'A', b'G', b'G',
        b'C', b'T', b'T', b'G', b'G', b'A', b'C', b'A', b'A', b'A', b'G',
        b'A', b'C', b'G', b'T', b'T', b'A', b'A', b'G', b'T', b'T', b'T',
        b'T', b'C', b'A', b'A', b'G', b'C', b'C', b'C', b'C', b'A', b'A',
        b'A', b'T', b'G', b'T', b'T', b'T', b'T', b'T', b'G', b'G', b'A',
        b'G', b'G', b'A', b'G', b'A', b'T', b'G', b'G', b'T', b'T', b'C',
        b'C', b'C', b'C', b'C', b'C', b'T', b'T', b'A', b'A', b'A', b'T',
        b'T'],
       [b'C', b'A', b'T', b'G', b'C', b'T', b'T', b'C', b'A', b'C', b'C',
        b'A', b'G', b'C', b'C', b'C', b'A', b'T', b'A', b'G', b'C', b'G',
        b'G', b'T', b'G', b'G', b'G', b'C', b'T', b'C', b'T', b'G', b'T',
        b'G', b'C', b'C', b'C', b'C', b'C', b'C', b'A', b'C', b'T', b'C',
        b'T', b'G', b'C', b'C', b'C', b'A', b'C', b'A', b'G', b'G', b'T',
        b'A', b'G', b'G', b'C', b'T', b'G', b'G', b'G', b'C', b'T', b'G',
        b'G', b'C', b'T', b'G', b'T', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'C', b'C', b'C', b'G', b'G', b'C', b'A', b'G', b'G',
        b'C', b'T', b'T', b'G', b'G', b'A', b'C', b'A', b'A', b'A', b'G',
        b'A', b'C', b'G', b'T', b'T', b'A', b'A', b'G', b'T', b'T', b'T',
        b'T', b'C', b'A', b'A', b'G', b'C', b'C', b'C', b'C', b'A', b'A',
        b'A', b'T', b'G', b'T', b'T', b'T', b'T', b'T', b'G', b'G', b'A',
        b'G', b'G', b'A', b'G', b'A', b'T', b'G', b'G', b'T', b'T', b'C',
        b'C', b'C', b'C', b'C', b'C', b'T', b'T', b'A', b'A', b'A', b'T',
        b'T'],
       [b'C', b'A', b'T', b'G', b'C', b'T', b'T', b'C', b'A', b'C', b'C',
        b'A', b'G', b'C', b'C', b'C', b'A', b'T', b'A', b'G', b'C', b'G',
        b'G', b'T', b'G', b'G', b'G', b'C', b'T', b'C', b'T', b'G', b'T',
        b'G', b'C', b'C', b'C', b'C', b'C', b'C', b'A', b'C', b'T', b'C',
        b'T', b'G', b'C', b'C', b'C', b'A', b'C', b'A', b'G', b'G', b'T',
        b'A', b'G', b'G', b'C', b'T', b'G', b'G', b'G', b'C', b'T', b'G',
        b'G', b'C', b'T', b'G', b'T', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N', b'N',
        b'N', b'N', b'C', b'C', b'C', b'G', b'G', b'C', b'A', b'G', b'G',
        b'C', b'T', b'T', b'G', b'G', b'A', b'C', b'A', b'A', b'A', b'G',
        b'A', b'C', b'G', b'T', b'T', b'A', b'A', b'G', b'T', b'T', b'T',
        b'T', b'C', b'A', b'A', b'G', b'C', b'C', b'C', b'C', b'A', b'A',
        b'A', b'T', b'G', b'T', b'T', b'T', b'T', b'T', b'G', b'G', b'A',
        b'G', b'G', b'A', b'G', b'A', b'T', b'G', b'G', b'T', b'T', b'C',
        b'C', b'C', b'C', b'C', b'C', b'T', b'T', b'A', b'A', b'A', b'T',
        b'T']], dtype='|S1')
In [ ]:
        # get consens seq and variant site index 
        print("")
        clust = []
        avars = refvars(arr.view(np.uint8), PSEUDO_REF)
        dat = b"".join(avars.view("S1")[:, 0]).decode()
        snpstring = "".join(["*" if i else " " for i in avars[:, 1]])
        clust.append("ref_{}:{}-{}\n{}".format(*region, dat))

        # or, just build variant string (or don't...)
        # write all loci with [locids, nsnps, npis, nindels, ?]
        # for key in keys:
        #     clust.append("{}\n{}".format(key, rdict[key]))
        # clust.append("SNPs\n" + snpstring)
        # clusts.append("\n".join(clust))

    outfile.write(str.encode("\n//\n//\n".join(clusts) + "\n//\n//\n"))
    outfile.close()
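A minimal sketch of what the lines above expect from refvars: an (nsites, 2) uint8 array whose first column holds the consensus base at each site and whose second column flags variant sites. The dat/snpstring construction can be exercised with a dummy array (the values here are illustrative, not real output):
In [ ]:
import numpy as np

# hypothetical stand-in for the refvars() result
avars = np.zeros((5, 2), dtype=np.uint8)
avars[:, 0] = [ord(c) for c in "ACGTA"]   # consensus base per site
avars[:, 1] = [0, 1, 0, 0, 1]             # 1 flags a variant site

dat = b"".join(avars.view("S1")[:, 0]).decode()                # 'ACGTA'
snpstring = "".join(["*" if i else " " for i in avars[:, 1]])  # ' *  *'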
In [34]:
remote_build_ref_clusters(self)
1 0 234 234
2 0 234 234
3 0 234 234
4 0 234 234

1 0 919 919
2 0 267 919
3 0 919 919
4 0 261 919

1 201 462 462
2 201 462 462
3 201 462 462
4 0 462 462

1 0 237 237
2 0 237 237
3 0 237 237
4 0 237 237

1 0 238 238
2 0 238 238
3 0 238 238
4 0 238 238

1 0 590 960
2 0 590 960
3 0 1054 960
4 0 636 960

1 0 257 257

2 0 248 248

1 0 887 887
2 78 289 887

1 0 434 623
2 222 623 623

1 0 510 674
2 271 510 674
3 271 510 674
4 271 674 674

3 0 328 328

1 244 494 649
2 0 494 649
3 0 649 649
4 0 494 649

1 0 545 545
2 0 545 545
3 0 545 545
4 0 545 545

1 254 501 1012
2 254 1012 1012
4 0 501 1012

2 0 202 202

1 0 827 827

1 0 243 243
2 0 243 243
3 0 243 243
4 0 243 243

1 0 378 378
2 163 378 378

1 0 223 223
2 0 223 223
3 0 223 223
4 0 223 223

1 0 562 562
2 0 228 562
3 0 228 562
4 0 228 562

1 0 254 254
2 0 254 254
3 0 254 254
4 0 254 254

4 0 259 259

1 0 468 468
2 58 273 468
3 0 273 468

1 0 775 775
2 343 578 775
3 0 578 775
4 343 578 775

1 0 211 211
3 0 211 211

1 0 228 228
2 0 228 228
3 0 228 228
4 0 228 228

4 0 854 854

1 0 238 238
2 0 238 238
3 0 238 238
4 0 238 238

1 274 765 765
2 0 765 765
3 0 765 765
4 274 765 765

1 0 219 219
2 1 219 219
3 1 219 219
4 1 219 219

1 180 422 422
2 0 422 422
3 180 422 422
4 180 422 422

1 0 231 231
2 0 231 231
3 0 231 231
4 0 231 231

1 44 288 695
2 44 288 695
3 0 695 695
4 44 695 695

1 0 251 251
2 0 251 251
3 0 251 251
4 0 251 251

1 0 242 552
2 0 242 552
3 0 552 552
4 0 242 552

1 0 258 258
2 0 258 258
3 0 258 258
4 0 258 258

1 0 583 868
2 376 868 868
3 376 583 868

4 0 874 874

2 0 459 539
3 0 459 539
4 0 539 539

1 118 383 383
2 118 383 383
4 0 383 383

1 274 737 737
2 0 737 737
3 274 737 737
4 0 737 737

1 0 249 249
2 0 249 249
3 4 249 249
4 0 249 249

1 392 858 858
3 0 801 858

1 2 219 219
2 0 219 219

2 502 992 992
3 0 760 992
4 502 760 992

2 0 207 207

3 0 448 448

1 0 481 481
2 0 215 481
3 0 215 481

1 0 536 536
2 276 536 536
3 0 536 536
4 0 536 536

1 0 631 769
2 0 631 769
3 0 631 769
4 0 769 769

2 0 569 842
3 358 842 842
4 0 842 842

1 139 620 620
2 356 620 620
3 0 620 620
4 0 620 620

1 216 471 471
2 0 471 471
3 0 471 471
4 216 471 471

3 0 215 215

1 145 367 598
2 145 598 598
3 0 367 598
4 145 598 598

1 0 223 223
2 0 223 223
3 0 223 223
4 0 223 223

3 0 210 210

4 0 476 476

1 272 505 690
2 272 690 690
3 157 505 690
4 0 505 690

1 247 532 532
2 0 460 532
3 247 461 532

1 0 217 389
2 0 389 389
3 0 389 389

4 0 1118 960

1 0 514 514
2 268 514 514
3 268 514 514
4 268 514 514

4 0 270 270

1 142 398 748
2 142 398 748
3 0 748 748
4 142 398 748

3 0 546 546

2 0 258 258
4 0 258 258

1 0 225 225
2 0 225 225
3 0 225 225
4 0 225 225

1 1047 1689 1929
2 338 1262 1929
2 1469 1929 1929
3 1046 1689 1929
4 0 820 1929
4 1047 1689 1929

1 136 355 607
2 136 355 607
3 136 607 607
4 0 355 607

1 0 509 509
2 0 415 509
3 0 509 509
4 0 247 509

1 0 355 355

1 0 490 491
2 258 491 491
3 0 490 491
4 258 490 491

4 0 835 835

3 0 227 227

1 0 214 214
2 0 214 214
3 0 214 214

1 431 1061 1195
2 235 1235 1195
3 431 1061 1195
4 0 1235 1195
CATGGAGTTGATGTGCCAAGGACAGATCAGAAACCAAATAGAGGGATAAAGAGAGTCCCTTTACAACTTGTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGGACAAGGCTCTTCACTTTAACGCTCAGGCTTTGTTTACACTGGGAGATGGTGTTCCAACACTGGCAGAAAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGGGGGCTTGACTCCTGTTTGTGCCTCTGAAAATCTCTCCTTTGGTGCTCGATGCTGTGCTTCTTACCACTGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTTGTCAGCAGCAGAAGTGAATCTTAGCAGCAGGCATAAACATATTTGCCTCTTCCGTTCCCAAGAAGCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTACATATCAATGGCTTTTCTAATGGCCAGTGCTGCAAACTTCACTTGGGATCTGAAAATCAGCTGGGTTCTCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN

4 0 762 762

1 0 510 510
2 0 383 510
3 0 383 510
4 140 383 510

1 450 677 677
2 450 677 677
3 0 677 677
4 450 677 677

1 185 518 518
2 285 518 518
3 285 518 518
4 0 518 518

1 0 510 644
2 0 644 644
3 0 510 644
4 0 510 644

2 0 590 590
4 0 590 590

1 0 222 222
2 0 222 222
3 0 222 222
4 0 222 222

1 0 507 507
2 0 507 507
3 0 507 507
4 0 507 507

1 0 551 551

1 0 442 442
4 191 442 442

1 296 553 553
2 296 553 553
3 296 553 553
4 0 553 553

1 0 438 438
2 0 258 438
3 0 258 438
4 0 438 438

1 0 352 767
2 109 767 767
3 109 352 767
4 109 352 767

1 0 240 241
2 0 241 241
3 0 240 241
4 0 240 241

3 0 416 416

1 0 516 516
2 0 227 516
3 0 227 516
4 0 227 516

2 0 327 327

1 0 892 892
2 278 892 892
3 186 723 892
4 278 892 892

1 0 470 470
2 0 470 470
3 0 470 470
4 0 470 470

1 0 1205 1383
CATGAGAGGTTATGCAGATGGAATGTGAAAATGATCAATAAAGAGAGTTGAAGTATTCCAGGCACCTGCTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGAGTTTCCTAGCAAAGGGCTGCTCATTTCATCCTGTACCTAGGCTGTCTCACTGCCTCCTGCTAGGGATTAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGTAGACTTTGTCTCCAAAGCAGCTTGTTTTGTTCATTCAATGAAAAGGTGCCACGTTCTCTTCCCTAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGATCTGCAAAGGGAAAAAAAAAAAGGGGGAGCTATGGGGGGTTAGCATCCTTGCGGGAGATAGATAAGTACTTAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTCTGTCAGTCTAGCTCCTGCTACTCAAGGAGGTGGATTACCTACCTCAATAGGAGAACCTCCTCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
2 99 643 1383
2 975 1205 1383
3 424 1205 1383
4 99 643 1383
4 975 1383 1383

1 0 255 410
2 0 255 410
3 0 255 410
4 0 410 410

1 0 249 1239
1 640 1239 1239
2 0 249 1239
2 640 1239 1239
3 0 1239 1239
CATGTCATTTTCTTTGAGTTTTTCCCCTTTCACCTGGAAACTATCAAACCCTTGAAACTCTGGGTTGTGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATTGTTCTGACCATCTTCCTGCGGCTGAGGCATCTCAAGAAACTGAGAAACGTGAGGTGCGTTAGGGACCAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGGTCATTGCCCATTCTATGCGATATATCCGGGGATCGAAAAAGCACCCACAAGGTGCCATCTGGAAACCTGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCAGCCAAGACCCGCCAAAGGCCAGATCCTCAGCTGGTGTAAATTGGTGGTGTAGTTCCGGTGACTTCAATGGGGCTGAGCCACTGTACACCAGCTAAGGGTCCAGCACAAAGGATCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAGCGTCACTTGCACCGAACTGGGATTTGGGGCAAATGCAGTCTGGAGTCAGAGGGTGCAGGTTCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
4 0 1239 1239
CATGTCATTTTCTTTGAGTTTTTCCCCTTTCACCTGGAAACTATCAAACCCTTGAAACTCTGGGTTGTGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATTGTTCTGACCATCTTCCTGCGGCTGAGGCATCTCAAGAAACTGAGAAACGTGAGGTGCGTTAGGGACCAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGATGTATTGGGGGTTGCTTGGGACTGGGTGGTAAGGTGGGTAGGGGACTGTTTGGACAGGACTCTTGANNNNNNCGTGGGTGTCCAGAAGGTAAGCGCTTGGTGAACTTGATCCACCCGTGAGTTTATAAACTGACGGTTGCACGAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGGTGGTGTAGTTCCGGTGACTTCAATGGGGCTGAGCCACTGTACACCAGCTAAGGGTCCAGCACAAAGGATCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATAGCGTCACTTGCACCGAACTGGGATTTGGGGCAAATGCAGTCTGGAGTCAGAGGGTGCAGGTTCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN

1 0 254 254
2 0 254 254
3 0 254 254
4 0 254 254

2 0 511 511
4 0 511 511

1 584 1015 1015
2 584 1015 1015
3 0 1016 1015
CATGTCTGAGTCTGTATCTCCTGAAGATGGAGCTATCCAGTTCCCTGCTACATATGTGCCAGTTCAACTGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCCATTAACATACATAGAGAGACCTAGGCATCCCCTAGTATTATTTCAGGTAGAAGCAACATACAACTGTTGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGTCCTAAGGGAGGCGAAAGGCCAAGTTAAAGGATGCCCAACTAGGAGAGGAGCAATTNNNNNNNNNNNNNNNNCATGGGAGGGGAGCGGCTGCAATGATGTCTGGAAAGAAGGGGAACAAAAGGTCTTGTGGTTTGGATGCAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGATCTCCAGATCTGTGTGAACTTCAGTTTTAGGAGCTGTGAACCTCTTGAGCGATACTTGTAGGAGATTTCAAATTACCCCTCTGCTGTGTAATAGGGATGGAAGAAGCCTCCAGACCAAAGTAGTTTAGCACAAGCGGATGCCCCTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNANAAAACNCTGCCCTG
4 584 1015 1015

1 0 234 239
2 0 234 239
3 0 239 239
4 0 239 239

1 272 541 541
2 0 541 541
3 316 541 541
4 0 541 541

1 0 217 217
2 0 217 217
3 0 217 217

1 361 582 788
2 361 582 788
3 0 788 788
4 361 582 788

1 0 558 558

1 468 722 722
2 468 722 722
3 0 722 722
4 468 722 722

2 0 214 215
3 0 214 215
4 0 215 215

1 0 230 230
2 0 230 230
3 0 230 230
4 0 230 230

1 0 427 427
2 0 427 427
3 0 427 427
4 0 427 427

1 0 847 848
2 134 847 848
3 0 363 848
3 625 848 848
4 134 363 848
4 625 847 848

1 0 247 247
2 0 247 247
3 0 247 247
4 0 247 247

1 0 217 736
1 496 736 736
2 0 217 736
2 496 736 736
3 0 736 736
4 0 217 736
4 496 736 736

1 0 305 451
2 0 451 451
4 0 451 451

1 0 244 244
2 0 244 244
3 0 244 244
4 0 244 244

1 0 265 265
4 0 265 265

1 116 390 390
2 0 341 390
3 116 341 390
4 116 341 390

1 0 237 237
2 0 237 237
3 0 237 237
4 0 237 237

1 231 487 487
2 231 487 487
3 231 487 487
4 0 487 487

1 0 223 223
2 0 223 223
3 0 223 223
4 0 223 223

1 0 226 226
2 0 226 226
4 0 226 226

1 305 528 528
2 305 528 528
3 0 528 528
4 305 528 528

4 0 263 263

1 0 583 583
2 94 583 583
3 94 307 583

1 612 1142 1142
2 337 1142 1142
3 0 1142 1142
CATGGGAAGCTTTGATCCCTTTCAGGAATCCTTAAGGGTGTGACATTTGAAGCTCTTCCAGAGACAGGATGNNNNNNNNNNNNNNNNNTGGTAAAGCCGTAAATCTTGGAAGCTCCGTGTTTGAGCCAAATCTACCCTAGGATGAGATGTAGCCTGCAGCAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGCCTCCGGAGGTTGTGGAATCTCCATCTCTGGAGATATTTAAGAGTAGTTTAGATAACTGTCTATCTGGGATNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATCTATGAATCCTTTTATGTCTCCACTATCTGTGGTGGAGCAATTTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGAAGCTGGTTAGTTTGTTTTATTCCATTTCCTTCACTGCTGGGTTCGCCTCCTTCATCTCAGCAGCCTGGAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGTTGATTTCCTTCACTTCTCAGCATTTCCCCTCTTGTTAGGACCTGATTCTCTGCAATCCTGTGCGTCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGATGCAAGGCAAGGGAGAATCAGGCCAATGGAG
4 612 1142 1142

4 0 256 256

1 276 807 807
2 276 807 807
3 276 807 807
4 0 807 807

2 0 212 212

1 0 484 675
2 0 484 675
3 0 675 675
4 0 484 675

2 0 1255 1072
AATTCTTTGAGCTGCTGTTGATAAATCAGCCTGGAAGAGGTGGAACGTTGTGAAGATTCCCGGGGTACCCAGCGTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCTATGCAGGTGAGCAAGAATCAGGCGTCCTAGACTCCACGGTGCAAATAAGTGACGGCCACGTTAACAYGACCCTGTACTGAAAACCCACGAACCACTATGCCTACCTTCATGCCTCCAGCTTCCATCCCGGACACACCACACGATCCATCGTCCACAGCCAAGTGCTGAGGTACAACTGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCAGACGTGTACCTAGAAGTCTCCTGCTGCAAGACAAACCCAAGAAAGAAACCAACAGGACTCCGCTGGCCATCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTCACAGGCCTTGGGTGGCAGGCCAGTCCTCGCCCCCAGACAGCCTGCCAACCTGAAGCATATTCTCACCAGTAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGCAACAAACCTTGATGCCAGCTATGCCCACATATCTACACCAGCGACACCATCACAGGATCTAACCAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGTCCACCAATGTAATATATGCCATCATGTGCCAGCAATGCCCCTCTGCTATGTACATCGGCCARACTGGACAGTCTCTACGGAAAAGGATAAATGGACACAAATCAGATATTAGGAATGGCAATCNNNNNNNNNNNNNNNNNNNNNNCT
3 0 496 1072
3 835 1072 1072
4 0 1072 1072
AATTCTTTGAGCTGCTGTTGATAAATCAGCCTGGAAGAGGTGGAACGTTGTGAAGATTCCCGGGGTACCCAGCGTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCAAATAAGTGACGGCCACGTTAACACGACCCTGTACTGAAAACCCACGAACCACTATGCCTACCTTCATGCCTCCAGCTTCCATCCCGGACACACCACACGATCCATCGTCCACAGCCAAGTGCTGAGGTACAACTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCAGACGTGTACCTAGAAGTCTCCTGCTGCAAGACAAACCCAAGAAAGAAACCAACAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAACAAACCTCGATGCCAACTCTGCCCACATATCTACACCAGCGACACCATCACAGGATCTAACCAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGTGCCAGCAATGCCCCTCTGCTATGTACATCGGMCAAACTGGMCAGTCTCTAYGGAAAAGGATAAATGGACACAAATCAGATATTAGGAATGGCAATTNNNNNNNNNNNNNNNGAGAACACT

1 0 227 227
2 0 227 227
3 0 227 227
4 0 227 227

2 0 220 220
3 0 220 220
4 0 220 220

1 0 230 230
2 0 230 230
3 0 230 230
4 0 230 230

1 0 222 222
2 0 222 222
3 0 222 222
4 0 222 222

3 0 606 606

4 0 265 265

1 0 229 229
2 0 229 229
3 0 229 229
4 0 229 229

1 0 238 238
2 0 238 238
3 0 238 238
4 0 238 238

1 0 704 704
2 203 704 704
3 0 704 704
4 0 704 704

1 0 573 573
2 0 573 573
3 0 573 573
4 0 573 573

1 0 254 254
3 0 254 254

1 0 227 227
2 0 227 227
3 0 227 227
4 0 227 227

1 0 220 220
2 0 220 220

2 0 572 572

2 0 413 413

1 0 651 930
2 0 930 930
3 0 651 930
4 0 651 930

1 0 220 220
2 0 220 220
3 0 220 220
4 0 220 220

1 0 350 350
3 0 350 350
4 0 209 350

4 0 542 542

2 118 1169 1078
3 0 1169 1078
AAATTTGCTTCTCTGTTAACTGGGGAGAATAAACCTTCCTTTCCCCCGCNNTGTGCTCTGCCTTGTCTAGCTAGATTGTCAGCTCTTTGAGACAGGGACTGTCTCTTACAGTGCGTAGCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGCTGCCTTTAACAATCTCACCTCCTTATATGGCTAAACTATTCTTTTTTTGAACTTGGCTTTGTTTACAGTCCACTTAAAATATTTAGACTCTGGCTGGGTTAGACAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTTATTTACATCCTATAAAATAAAGTGTTATTGGCAATATAGCAGCTTCCTAGCTCCATTTTAACTATTGACTGGGGTAACATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGAATTGAGTGGTACACAGGAATGGCGGAGCTCAGTCAGACCGCTGGGCCATCTAGCCCAAGGTTCTGCCTTTGCAAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGATTGAGTGCTCCATCCCATCATCCACTCCCAGCTTCTGGAAGTCAGAGGTCTAGA

1 271 518 518
2 271 518 518
3 271 518 518
4 0 518 518

1 0 256 256
2 0 256 256
3 0 256 256
4 0 256 256

4 0 679 679

1 0 258 258
2 0 258 258
4 0 258 258

1 152 501 501
2 152 390 501
3 152 390 501
4 0 501 501

1 0 717 717
2 0 244 717
2 502 717 717
3 0 717 717
4 0 244 717

1 405 622 809
2 0 622 809
4 405 809 809

1 387 617 617
2 387 617 617
4 0 617 617

1 1 233 233
2 1 233 233
3 0 233 233
4 1 233 233

1 349 605 605
2 349 605 605
3 349 605 605
4 0 605 605

1 0 498 498
2 0 498 498
3 0 498 498
4 0 498 498

1 577 1180 1180
2 340 1180 1180
3 571 1180 1180
4 0 1180 1180
CATGAAGGGCTGGTGCAACCATTTAGGCGACCGCCTAGGGCACAAAGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGCAGCAAGTAGGGGGGGGGAGGCGGCACGCGGGGGAGCCCCCCCCCCCAGCTCCCCCCTGCCCCGCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTCTCCCACCTCCCAGGCTTGTGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGGGGGGCGCAAGGTGGAAGTTTCACCTAGGGTGCGAAACATCTTTGCACCGGCCCTGCATCCATGACATGAGAAGTTGTGTCTTTTGGTCCCTGGGTGTCTGCCACATCAACACCACCACCCAATAGCTTTGGGGAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTCTGATTGGCAGTGCTTCCTAGTTCCCTGATTGATCCAGATGCTCTATTTAAGCCTGGAGGGGACAGAAGAAAATTTTCTGTACAAGTAGGTGGATCCTCAGCTGGCTGGTTTGACAGCACCTGCTCCTTTGCATTGCTTTGGCTCCTNNNNNNNNNNNNNNNNNNNNNCCCTTAGCTCTGGTCCCAGGCCCCTTACTCCACTTTGCCTACTAAGCTGAAGCACCCGTGCCCTAGCCATGACAGCTGGTAGCTGTGGCAAGATTGAAATGCAATAC

1 11 248 248
2 11 248 248
3 11 248 248
4 0 248 248

1 135 510 510
2 135 510 510
4 0 510 510

1 0 233 233
2 0 233 233
3 0 233 233
4 0 233 233

1 0 462 462
2 0 220 462
3 0 316 462

1 0 363 363
2 0 256 363
3 0 256 363
4 0 256 363

1 110 352 352
2 0 352 352
3 110 352 352
4 110 352 352

1 0 218 218
4 1 216 218

1 0 534 908
2 275 908 908
3 275 534 908
4 275 534 908

1 0 579 676
2 345 579 676
3 204 579 676
4 345 676 676

1 0 486 486
2 0 486 486
3 0 486 486
4 0 486 486

1 0 232 368
2 0 232 368
3 0 368 368
4 0 232 368

1 0 817 817

1 88 309 309
2 0 309 309
3 88 309 309
4 81 309 309

2 83 296 296
3 0 296 296

2 0 256 256
4 0 256 256

1 0 248 248
2 0 248 248
3 0 248 248
4 0 248 248

1 0 779 943
2 0 779 943
3 0 943 943
4 0 943 943

1 0 232 660
2 0 232 660
3 0 660 660

4 0 249 249

1 195 444 444
2 195 444 444
3 0 444 444
4 0 444 444

1 0 821 960
2 0 1059 960
3 0 1148 960
4 0 1059 960

2 184 711 711
4 0 453 711

1 0 755 755

1 0 249 490
2 0 249 490
3 0 490 490
4 0 249 490

1 0 612 612
3 0 612 612
4 0 612 612

4 0 259 259

1 0 419 419
2 0 246 419
3 0 246 419
4 0 419 419

1 0 232 515
2 0 470 515
3 0 232 515
4 0 515 515

1 0 725 725
2 220 438 725
3 0 438 725
4 104 438 725

1 0 248 248
2 0 248 248
3 0 248 248
4 0 248 248

2 0 465 465
4 0 465 465

3 0 429 429

1 0 844 844
2 0 224 844
2 611 844 844
3 0 224 844
3 611 844 844
4 0 224 844
4 449 844 844

1 201 441 441
2 201 441 441
3 201 441 441
4 0 441 441

1 121 353 353
2 121 353 353
3 121 353 353
4 0 353 353

1 0 253 540
2 0 253 540
3 0 540 540
4 0 253 540

1 644 1720 1604
2 644 1720 1604
3 133 1162 1604
CATGATTGCATCAAAAACCTACCACAAGAGTTCAAAGTTAGGGGAGANNNNNNNNNATACTCCTTCTCCAGGGAGAGCCAGTAAATGGCAGTTTAACTGCCTTATTAACAGCAACCCTTGTATCTGCCAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGTAGTTACCTTGAAGATAGGGTCTAGGATTAGCTGGCAGAAAGTCCTGGGCAGTTTCCTTCCATCAGNNNNNNNNNNNNNNNNNNNNNNNNTGCCAGAAGCAGGATCAAAATACCTAAAAAAAAAATCCAGTCAGAGGGGATTGTGGTTTCGCCATCTTGTTTAATTAGCACTGCTTATTCTGCTTGCCTTTCAAGATGGAGGGAAGGGTTATCAAGTCAAAGACCCGTTTTGAGATGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGTATCGTGTATTCCTGATCCCATCCCCTAATATTTGTCTGATTAAATCTAGATTTAAAGTCCCTCCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTAGGATTCCTTCCTTTGGGTGCTACCATACTGTTGCTTCCTTACTCCCAGCCCAGGAGAGCTCAAGTATTCCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAA
4 0 1720 1604
CATGTCTCTTCCCAAGGCAATGCCTATCAGTTCTTACCTTCAGTAGGGGTTTGCCCTCCTTGTCTTTATCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGATTGCATCAAAAACCTACCACAAGAGTTCAAAGTTAGGGGAGAAAGCAGCTCATACTCCTTCTCCAGGGAGAGCCAGTAAATGGCAGTTTAACTGCCTTATTAACAGCAACCCTTGTATCTGCCAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGTAGTTACCTTGAAGATAGGGTCTAGGATTAGCTGGCAGAAAGTCCTGGGCAGTTTCCTTCCATCAGNNNNNNNNNNNNNNNNNNNNNNNTTTCCAGAAGCAGGATCAAAATACCTNAAAAAAAAATCCAGTCAGAGGGGATTGTGGTTTCGCCATCTTGTTTAATTAGCACTGCTTATTCTGCTTGCCTTTCAAGATGGAGGGAAGGGTTATCAAGTCAAAGACCCGTTTTGAGATGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGTATCGTGTATTCCTGATCCCATCCCCTAATATTTGTCTGATTAAATCTAGATTTAAAGTCCCTCCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTAGGATTCCTTCCTTTGGGTGCTACCATACTGTTGCTTCCTTACTCCCAGCCCA

1 105 331 331
2 105 329 331
3 0 329 331
4 105 329 331

1 0 252 593
2 0 252 593
3 0 593 593
4 0 252 593

3 0 473 473

2 0 231 231
3 0 231 231

1 0 211 211
2 0 211 211

2 0 831 831
4 568 831 831

2 201 427 427
3 201 427 427
4 0 427 427

1 1 247 247
2 0 247 247
3 1 247 247
4 1 247 247

1 672 889 889
2 315 889 889
4 0 447 889

1 830 1085 1085
4 0 1085 1085
AATTTTATACCCGGGCCACAAAGAGGCGTCATTTTGCGGGGTCAGCTACAGCCCAGCAATATCATCTGCAGTGATACTTCCAGCTGTTCTCCTGCCTTGCTAGGCTGCAAGACTTCTCTGTGATTCATGNNNNNNNNNNNNNNNNNNNNNNNNNNAATTTTAACCAACCAACCAGCCCATCCTCCTCCTCCTCCATCTTTGGGCAGGCAGCGTCTTTGACTTGCATATCGAGAGCAGCTTACCAGTTGTGACCATCTCAGTTCGGGGACCGGCTGTCTCACTTCTGCAGTGCATGGGAAGCAATAACTACGGACAGATGGGTCCTAACCACTGTGGGGTCAGGTTATGTCATCCAACTTCTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTATCAGGGAAAGGGATTCTGCTCCTGGCACTTCCTGATCCCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGTGGTGGTTTTTCCAGACCACAGCAAGTTTCCTAATCATCAGTCAACATTATCAATACACTGGTCTCCNNNNNNNTCTGTCATCAGCCCCACGTGTCTTTAGTGAATGTCTGATGATGGTTGTAGTGACTTATCCCCAATGGAGGGAAATTCATGTCCTCCCATACCTAGACAACTGGCTAGTGAGGGGCAGATCCAAAAAATAAGTCCTCAGTCCTCTCCAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN

1 0 411 411
2 191 411 411
3 0 411 411
4 191 411 411

1 0 503 503
2 0 503 503
3 0 503 503
4 0 503 503

1 114 352 427
2 114 352 427
3 0 427 427
4 114 427 427

1 0 396 541
2 0 541 541
3 146 477 541
4 146 396 541

1 0 730 730
2 84 545 730
3 0 631 730
4 0 545 730

1 0 256 257
2 0 256 257
3 0 257 257
4 0 256 257

1 0 325 325
2 72 325 325
3 72 325 325
4 2 325 325

2 0 257 257
4 0 257 257

1 0 566 566
2 0 566 566
3 0 566 566
4 0 566 566

1 0 427 661
2 195 661 661
3 195 661 661
4 195 427 661

1 0 216 414
2 0 414 414
3 0 414 414

1 0 829 991
2 409 829 991
3 594 829 991
4 594 991 991

1 0 263 960
1 612 828 960
2 0 989 960
3 0 263 960
3 612 828 960
4 0 263 960

1 177 417 624
2 0 417 624
3 0 624 624
4 177 417 624

1 305 1351 1351
CATGGGCTCCAGTCCAAGCCCAAATATCTCCACCACACGTTTACACCGCCGCAGCACGAGCCCAAGTTAGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAACTGTTCTGACACACAACTGCGTGCAAAGCTCTTTGAGATACTCAGGCCAAAGGCTCTATTAAGTACAAAGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGCTACCCAAACTGAACTGACTCTCTGGTATCAATGGCTCTGTTTGAACGTAATAAGATGTTATAGGGTTGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTGGCACCAGGAAAATGTAAATACATTTCACATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGGGCAGGGGGGGCTGCAAGGTTATTGTAGTGAGGGCACAGTATTGTCACCCTTACTTTTGTGCTGTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
2 305 562 1351
2 1099 1351 1351
3 0 1351 1351
AATTTTAGCATTCACATTGTTTTTTCCCTTTTACTGCTGTGTCAAAGATGAAAACAGCTGAAGGATGACTTAAAGATCCAGCTGAATGGTATCATTTACCGCTGTACTCTGGGACATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGGCTCCAGTCCAAGCCCAAATATCTCCACCACACGTTTACACCGCCGCAGCACGAGCCCAAGTTAGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAACTGTTCTGACACACAACTGCGTGCAAAGCTCTTTGAGATACTCAGGCCAAAGGCTCTATTAANTACAAAGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGCTACCCAAACTGAACTGACTCTCTGGTATCAATGGCTCTGTTTGAACGTAATAAGATGTTATAGGGTTGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCACCAGCAAAGGGATTCATTACTTGTCCTTCAGTTTCTGGCACCAGGAAAATGTAAATACATTTCACATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
4 305 1351 1351
CATGGGCTCCAGTCCAAGCCCAAATATCTCCACCACACGTTTACACCRCCGCAGCACGAGCCCAAGTTAGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAACTGTTCTGACACACAACTGCGTGCAAAGCTCTTTGAGATACTCAGGCCAAAGGCTCTATTAARTACAAAGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTGCTACCCAAACTGAACTGACTCTCTGGTATCAATGGCTCTGTTTGAACGTAATAAGATGTTATAGGGTTGGGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCACCAGCAAAGGGATTCATTACTTGTCCTTCAGTTTCTGGCACCAGGAAAATGTAAATACATTTCACATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGGGCAGGGGGGGCTGCAAGGTTATTGTAGTGAGGGCACAGTATTGTCACCCTTACTTTTGTGCTGTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN

1 0 464 464
2 0 464 464
3 0 464 464
4 0 464 464

1 236 570 960
2 0 570 960
3 339 570 960
4 0 1241 960

1 447 873 873
2 247 873 873
3 0 873 873
4 651 873 873

1 0 412 412
2 0 235 412
3 0 235 412
4 0 235 412

1 0 810 810

1 0 1239 960
4 0 589 960

2 0 247 839
3 0 839 839
4 0 247 839

1 359 577 577
2 0 577 577
4 14 577 577

1 0 230 841
1 620 841 841
2 0 417 841
2 620 841 841
3 0 841 841
4 0 841 841

1 0 237 237
2 0 237 237
3 0 237 237
4 0 237 237

1 0 498 498
2 0 234 498
3 0 234 498
4 0 234 498

2 0 300 300

1 0 240 385
2 0 240 385
3 0 385 385
4 0 240 385

1 0 523 561
2 0 249 561
3 0 561 561
4 0 249 561

1 191 420 420
2 183 420 420
4 0 420 420

2 0 259 259
3 0 259 259
4 0 259 259

1 0 253 253
2 0 253 253
3 0 253 253
4 0 253 253

1 0 246 246
2 0 246 246
3 0 246 246
4 0 246 246

1 224 568 568
2 342 568 568
3 0 568 568
4 224 568 568

1 0 226 227
2 0 226 227
3 0 226 227
4 0 227 227

1 49 286 286
2 49 286 286
3 0 286 286
4 49 286 286

1 0 228 228
2 0 228 228
3 0 228 228
4 0 228 228

1 0 553 554
2 0 554 554
3 0 241 554
4 0 554 554

1 187 715 715
2 0 715 715
3 187 715 715
4 187 715 715

1 0 236 236
2 0 236 236
3 0 236 236
4 0 236 236

1 0 473 473

1 299 766 766
2 0 766 766
3 0 766 766
4 0 766 766

1 0 247 247
3 0 247 247
4 123 247 247

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-34-bf84d231351c> in <module>()
----> 1 remote_build_ref_clusters(self)

<ipython-input-33-22b5bd623ae0> in remote_build_ref_clusters(self)
     66         print("")
     67         clust = []
---> 68         avars = refvars(arr.view(np.uint8), PSEUDO_REF)
     69         dat = b"".join(avars.view("S1")[:, 0]).decode()
     70         snpstring = "".join(["*" if i else " " for i in avars[:, 1]])

ValueError: cannot assign slice from input of different size
In [12]:
"use bedtools to pull in clusters and match catgs with alignments"
cmd1 = [
    ipyrad.bins.bedtools,
    "bamtobed",
    "-i", 
    os.path.join(
        data.dirs.across,
        "{}.cat.sorted.bam".format(self.data.name)
        )
]
" ".join(cmd1)
Out[12]:
'/home/deren/local/src/ipyrad/bin/bedtools-linux-x86_64 bamtobed -i /home/deren/local/src/ipyrad/tests/6-tortas/test_across/test.cat.sorted.bam'
In [13]:
cmd2 = [
    ipyrad.bins.bedtools, 
    "merge", 
    "-d", "0",
    "-i", "-",
]

" ".join(cmd2)
Out[13]:
'/home/deren/local/src/ipyrad/bin/bedtools-linux-x86_64 merge -d 0 -i -'
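cmd1 and cmd2 are meant to be chained, with bamtobed feeding merge through a pipe (hence the "-i -" argument). A minimal sketch of that plumbing, assuming sps is the subprocess module as elsewhere in this notebook:
In [ ]:
import subprocess as sps

# bamtobed | merge: feed cmd1 stdout into cmd2 stdin
proc1 = sps.Popen(cmd1, stdout=sps.PIPE)
proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=sps.PIPE, stderr=sps.PIPE)
out, err = proc2.communicate()
if proc2.returncode:
    raise IPyradError("error in bedtools: {}".format(err))
regions = out.decode().strip().split("\n")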
In [35]:
self.remote_build_ref_clusters()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-35-94508fef440e> in <module>()
----> 1 self.remote_build_ref_clusters()

~/local/src/ipyrad/ipyrad/assemble/cluster_across.py in remote_build_ref_clusters(self)
    594             arr = np.zeros((len(rdict), len(read.seq)), dtype=bytes)
    595             for idx, key in enumerate(keys):
--> 596                 arr[idx] = list(rdict[key])
    597 
    598             # get consens seq and variant site index

ValueError: cannot copy sequence with size 288 to array axis with dimension 371
In [ ]:
 
In [5]:
test.run("6", ipyclient=ipyclient)
Assembly: test
[####################] 100% 0:00:00 | concatenating bams   | s6 |
[####################] 100% 0:00:00 | building clusters    | s6 |

  Encountered an unexpected error (see ./ipyrad_log.txt)
  Error message is below -------------------------------
cannot copy sequence with size 288 to array axis with dimension 371
In [4]:
test2 = test.branch("test2")
In [5]:
from ipyrad.assemble.consens_se import *
In [6]:
self = Step5(test2, True, ipyclient)
In [7]:
self.remote_calculate_depths()
self.remote_make_chunks()
[####################] 100% 0:00:16 | calculating depths   | s5 |
[####################] 100% 0:00:37 | chunking clusters    | s5 |
In [8]:
statsdicts = self.remote_process_chunks()
[####################] 100% 0:14:53 | consens calling      | s5 |
In [9]:
sample = self.samples[0]
data = self.data
In [10]:
# write sequences to SAM file
combs1 = glob.glob(os.path.join(
    data.tmpdir,
    "{}_tmpcons.*".format(sample.name)))
combs1.sort(key=lambda x: int(x.split(".")[-1]))

# open sam handle for writing to bam
samfile = sample.files.consens.replace(".bam", ".sam")    
with open(samfile, 'w') as outf:

    # parse fai file for writing headers
    fai = "{}.fai".format(data.paramsdict["reference_sequence"])
    fad = pd.read_csv(fai, sep="\t", names=["SN", "LN", "POS", "N1", "N2"])
    headers = ["@HD\tVN:1.0\tSO:coordinate"]
    headers += [
        "@SQ\tSN:{}\tLN:{}".format(i, j)
        for (i, j) in zip(fad["SN"], fad["LN"])
    ]
    outf.write("\n".join(headers) + "\n")

    # write to file with sample names imputed to line up with catg array
    counter = 0
    for fname in combs1:
        with open(fname) as infile:
            # impute catg-ordered seq names; read into `lines` so the
            # Assembly object bound to `data` above is not shadowed
            lines = infile.readlines()
            fdata = []
            for line in lines:
                name, chrom, rest = line.rsplit(":", 2)
                fdata.append(
                    "{}_{}:{}:{}".format(name, counter, chrom, rest)
                    )
                counter += 1
            outf.write("".join(fdata) + "\n")
In [12]:
cmd = [ip.bins.samtools, "view", "-Sb", samfile]
with open(sample.files.consens, 'w') as outbam:
    proc = sps.Popen(cmd, stderr=sps.PIPE, stdout=outbam)
    comm = proc.communicate()[1]
if proc.returncode:
    raise IPyradError("error in samtools: {}".format(comm))
---------------------------------------------------------------------------
IPyradError                               Traceback (most recent call last)
<ipython-input-12-64d33fc863e2> in <module>()
      4     comm = proc.communicate()[1]
      5 if proc.returncode:
----> 6     raise IPyradError("error in samtools: {}".format(comm))

IPyradError: error in samtools: b'[E::sam_parse1] CIGAR and query sequence are of different length\n[W::sam_read1] parse error at line 10690\n[main_samview] truncated file.\n'
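The samtools message means some SAM record's CIGAR consumes a different number of query bases than its SEQ field holds. A debugging sketch (not part of the pipeline) to locate the offending record by scanning the text SAM directly, since samtools refuses to parse past line 10690; query-consuming ops are M, I, S, =, X:
In [ ]:
import re

with open(samfile) as insam:
    for num, line in enumerate(insam, 1):
        if line.startswith("@"):
            continue
        fields = line.split("\t")
        cigar, seq = fields[5], fields[9]
        # sum the lengths of the query-consuming CIGAR operations
        qlen = sum(
            int(n) for (n, op) in re.findall(r"(\d+)([MIDNSHP=X])", cigar)
            if op in "MIS=X")
        if qlen != len(seq):
            print(num, fields[0], cigar, qlen, len(seq))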
In [14]:
368852-370424 
Out[14]:
-1572
In [8]:
self.remote_concatenate_chunks()
self.data_store(statsdicts)
[####################] 100% 0:01:20 | indexing alleles     | s5 |
---------------------------------------------------------------------------
IPyradError                               Traceback (most recent call last)
<ipython-input-8-b7d045fddf04> in <module>()
----> 1 self.remote_concatenate_chunks()
      2 self.data_store(statsdicts)

~/local/src/ipyrad/ipyrad/assemble/consens_se.py in remote_concatenate_chunks(self)
    315             if not job.successful():
    316                 ip.logger.error("error in step 5: %s", job.exception())
--> 317                 raise IPyradError(job.exception())
    318 
    319 

IPyradError: IPyradError(error in samtools: b'[E::sam_parse1] CIGAR and query sequence are of different length\n[W::sam_read1] parse error at line 10690\n[main_samview] truncated file.\n')
In [ ]:
shutil.rmtree(self.data.tmpdir)
self.data.save()
In [ ]:
 
In [5]:
from ipyrad.assemble.cluster_across import *
In [7]:
self = Step6(test, True, ipyclient)
In [40]:
self.remote_concat_bams()
[####################] 100% 0:00:56 | concatenating bams   | s6 |
In [39]:
self.remote_build_ref_regions()
[####################] 100% 0:00:13 | building clusters    | s6 |
In [41]:
self.regions[:10]
Out[41]:
[('Contig0', 3480, 3747),
 ('Contig0', 24390, 24761),
 ('Contig0', 24903, 25488),
 ('Contig0', 30442, 30712),
 ('Contig0', 33784, 34013),
 ('Contig0', 37975, 38628),
 ('Contig0', 40387, 40730),
 ('Contig0', 41782, 42162),
 ('Contig0', 44567, 44783),
 ('Contig0', 47550, 47763)]
In [95]:
# access reads from bam file using pysam
bamfile = AlignmentFile(
    os.path.join(
        self.data.dirs.across,
        "cat.sorted.bam"),
    'rb')

# catcons output file for raw clusters and db samples
outfile = gzip.open(
    os.path.join(
        self.data.dirs.across,
        "{}.catcons.gz".format(self.data.name)),
    'wb')

# write header line to catcons with sample names
snames = ['ref'] + sorted([i.name for i in self.samples])
outfile.write(
    b" ".join([i.encode() for i in snames]) + b"\n")

for region in self.regions[:100]:
    reads = bamfile.fetch(*region)

    # get reference sequence
    refn, refs = get_ref_region(
        self.data.paramsdict["reference_sequence"],
        region[0], region[1] + 1, region[2] + 1
        )

    # build cluster dict for sorting      
    rdict = {}
    for read in reads:
        rdict[read.qname] = read.seq   
    keys = sorted(rdict.keys(), key=lambda x: x.rsplit(":", 2)[0])

    # make an empty array
    arr = np.zeros((len(snames), len(refs)), dtype=bytes)

    # fill the array
    arr[0] = list(refs)
    for idx, key in enumerate(keys):
        # get start and stop from this seq
        sname = key.rsplit("_", 1)[0]
        rstart, rstop = key.rsplit(":", 2)[-1].split("-")
        sidx = snames.index(sname)

        # get start and stop relative to reference region
        start = int(rstart) - int(region[1]) - 1
        stop = start + int(rstop) - int(rstart)
        
        print(key)
        print(rstart, rstop, start, stop, len(rdict[key]), len(refs))
        arr[sidx, start: stop] = list(rdict[key])
        print("")

    arr[arr == b""] = b"N"
    clusts.append(b"\n".join([b"".join(line) for line in arr]))
AGO09concat_0:1:3481-3748
3481 3748 0 267 267 267

PZ03concat_0:1:3481-3748
3481 3748 0 267 267 267

LP08concat_0:1:24391-24679
24391 24679 0 288 288 371

PBR_B125concat_0:1:24391-24762
24391 24762 0 371 371 371

AGO09concat_1:1:24904-25489
24904 25489 0 585 585 585

PBR_B125concat_1:1:25020-25249
25020 25249 116 345 229 585

PZ03concat_1:1:30443-30713
30443 30713 0 270 270 270

AGO09concat_2:1:33785-34014
33785 34014 0 229 229 229

LP08concat_1:1:33785-34014
33785 34014 0 229 229 229

PBR_B125concat_2:1:33785-34014
33785 34014 0 229 229 229

PZ03concat_2:1:33785-34014
33785 34014 0 229 229 229

PZ03concat_3:1:37976-38629
37976 38629 0 653 653 653

AGO09concat_3:1:40396-40633
40396 40633 8 245 237 343

LP08concat_2:1:40388-40633
40388 40633 0 245 245 343

PBR_B125concat_3:1:40396-40633
40396 40633 8 245 237 343

PZ03concat_4:1:40388-40731
40388 40731 0 343 343 343

AGO09concat_4:1:41783-42163
41783 42163 0 380 380 380

LP08concat_3:1:41920-42163
41920 42163 137 380 243 380

PBR_B125concat_4:1:41920-42163
41920 42163 137 380 243 380

PZ03concat_5:1:41920-42163
41920 42163 137 380 243 380

AGO09concat_5:1:44568-44784
44568 44784 0 216 216 216

LP08concat_4:1:44568-44784
44568 44784 0 216 216 216

PZ03concat_6:1:44568-44784
44568 44784 0 216 216 216

PBR_B125concat_5:1:47551-47764
47551 47764 0 213 213 213

AGO09concat_6:1:47992-48229
47992 48229 0 237 237 237

PBR_B125concat_6:1:49479-50005
49479 50005 0 526 526 526

AGO09concat_7:1:56089-56333
56089 56333 111 355 244 355

LP08concat_5:1:56089-56333
56089 56333 111 355 244 355

PBR_B125concat_7:1:56089-56333
56089 56333 111 355 244 355

PZ03concat_7:1:55978-56333
55978 56333 0 355 355 355

AGO09concat_8:1:62284-62501
62284 62501 28 245 217 457

LP08concat_6:1:62284-62501
62284 62501 28 245 217 457

PBR_B125concat_8:1:62284-62501
62284 62501 28 245 217 457

PZ03concat_8:1:62256-62713
62256 62713 0 457 457 457

PZ03concat_9:1:64734-65000
64734 65000 0 266 266 266

AGO09concat_9:1:68597-68887
68597 68887 829 1119 290 1301

LP08concat_7:1:68597-69007
68597 69007 829 1239 410 1301

PBR_B125concat_9:1:68597-69069
68597 69069 829 1301 472 1301

PZ03concat_10:1:67768-68840
67768 68840 0 1072 960 1301
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-95-4ea82413ea79> in <module>()
     50         print(key)
     51         print(rstart, rstop, start, stop, len(rdict[key]), len(refs))
---> 52         arr[sidx, start: stop] = list(rdict[key])
     53         print("")
     54 

ValueError: cannot copy sequence with size 960 to array axis with dimension 1072
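The failing record above (PZ03concat_10) shows the mismatch: the slice width is computed from the start-stop span in the read name (1072) while the stored sequence is shorter (960). A defensive sketch is to size the destination slice from the sequence itself, so the two widths always agree:
In [ ]:
# size the slice from the read itself so assignment widths always match
seq = rdict[key]
arr[sidx, start: start + len(seq)] = list(seq)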
In [88]:
# avars = refvars(arr.view(np.uint8), PSEUDO_REF)
# dat = b"".join(avars.view("S1")[:, 0]).decode()
# snpstring = "".join(["*" if i else " " for i in avars[:, 1]])
# cname = "ref_{}:{}-{}\n{}".format(*region, dat)
# cname
In [94]:
67768 - 68840
Out[94]:
-1072
In [89]:
clusts.append(b"\n".join([b"".join(line) for line in arr]))
Out[89]:
b'AATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCTTCTTGGCTTGTCTCTTGTATTCATTAAATGGAGCATCTCTTGTCACTGTCCAGCAATAGTCTGCAAGCATTGATGGGCTCCATTTGCCCTGATAGCGTTTCTCCATTGTTGCAATGTCCTGGTGAAATCGCTCGCCGTGCTCGTCGCTCACTGCTCCGCAGTTCGGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATGTTGCAACCAAGGCTTTTGTATGCCTTGAGGAGGTTTTCCACCAACAACCTGTAGTTGTCTGCCTTGTTGTTTCCGAGAAAATT\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nAATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTSAGTGTCTACTCAGCGCTTCTTGGCTTGTCTCTTTTATTCATTAAATGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATTGATAGGCTCCATTTGCCCTGATAGCGTTTCTCCATTGTTGCAATGTCCTGGTGGAATCGCTCGCCATGNNNNNCGCTCACTGCTCCGCAGTTCAGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\nAATTTATTATGAAACAAAAGGCNAAAAACNATTATGTACATANTTTAGTCCTATTCAGTGTCTACTCAGCNCTTCTTGGCTTGTCTCTTGTATTCATTAAATGGAGGATCTCTTGTCNNNNNNNNNNNNNNNNNNNNNNNNATTGATGGGCTCCATTTGCCCTGATAGCGTTTCTCCATTGTTGCAATGTCCTGGTGAAATCGCTCGCCNNNNNNNNCGCTCACTGCTCNGCAGTTCNGTGGAAAAAAATCTNNATGAGAGTGCAAAAAATGTATCTTTAGTGACATGTTGCAACCAAGGCTTTTGTATGCCTTGAGAAGGTTTTCCACCAACAACCTGTAGTTGTCTGCCTTGTTGTTTCCGAGAAAATT\nNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'
In [15]:
outfile.close()
In [4]:
test.run('6')
Assembly: test
[####################] 100% 0:00:56 | concatenating bams   | s6 |

  Encountered an unexpected error (see ./ipyrad_log.txt)
  Error message is below -------------------------------
cannot copy sequence with size 288 to array axis with dimension 371
In [26]:
## do cigar building again, this time with orientations
data = self.data
sample = list(data.samples.values())[0]
sample.name
Out[26]:
'AGO02concat'
In [21]:
# get all regions with reads. Generator to yield (str, int, int)
fullregions = bedtools_merge(data, sample).strip().split("\n")
regions = (i.split("\t") for i in fullregions)
regions = [(i, int(j), int(k)) for (i, j, k) in regions]
In [27]:
regs = regions[:20]
In [35]:
# access reads from bam file using pysam
bamfile = AlignmentFile(
    os.path.join(
        data.dirs.refmapping,
        "{}-mapped-sorted.bam".format(sample.name)),
    'rb')

# iterate over all regions
opath = os.path.join(
    data.dirs.clusts, "{}.alt.clustS.gz".format(sample.name))
out = gzip.open(opath, 'wt')
idx = 0
clusters = []
for reg in regs:
    # fetch the ref sequence so reads can be compared against it while testing
    ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
    reads = bamfile.fetch(*reg)

    # store reads in a dict
    rdict = {}

    # paired-end data cluster building
    if "pair" in data.paramsdict["datatype"]:

        # match paired reads together in a dictionary
        for read in reads:
            print("\t".join(
                str(i) for i in 
                [read.is_read1, read.is_reverse,
                 read.seq[:6], read.seq[-6:], 
                 *reg
                ]))
            if read.qname not in rdict:
                rdict[read.qname] = [read, None]
            else:
                rdict[read.qname][1] = read

        # sort keys by derep number
        keys = sorted(
            rdict.keys(),
            key=lambda x: int(x.split("=")[-1]), reverse=True)

        # build the cluster based on map positions, orientation, cigar
        clust = []
        clust.append("{}\n{}".format('ref', ref[1]))
        for key in keys:
            r1, r2 = rdict[key]
            if r1 and r2:

                #lref = len(ref[1])
                aref = np.array(list(ref[1]))
                lref = reg[2] - reg[1]
                arr1 = np.zeros(lref, dtype="U1")
                arr2 = np.zeros(lref, dtype="U1")
                arr1.fill("-")
                arr2.fill("-")

                # how far ahead of the start does this read begin
                seq = cigared(r1.seq, r1.cigar)
                start = r1.reference_start - reg[1] 
                arr1[start:start + len(seq)] = list(seq)

                seq = cigared(r2.seq, r2.cigar)
                start = r2.reference_start - reg[1] 
                arr2[start:start + len(seq)] = list(seq)

                arr3 = join_arrays(arr1, arr2)
                pairseq = "".join(arr3)

                ori = "+"
                if r1.is_reverse:
                    ori = "-"
                derep = r1.qname.split("=")[-1]
                rname = "{}:{}-{};size={};{}".format(*reg, derep, ori)
                clust.append("{}\n{}".format(rname, pairseq))

    # single-end data cluster building
    else:   
        for read in reads:
            rdict[read.qname] = read

        # sort keys by derep number
        keys = sorted(
            rdict.keys(),
            key=lambda x: int(x.split("=")[-1]), reverse=True)

        # build the cluster based on map positions, orientation, cigar
        clust = []
        for key in keys:
            r1 = rdict[key]

            #aref = np.array(list(ref[1]))
            lref = reg[2] - reg[1]
            arr1 = np.zeros(lref, dtype="U1")
            arr1.fill("-")

            # how far ahead of the start does this read begin
            seq = cigared(r1.seq, r1.cigar)
            start = r1.reference_start - reg[1] 
            arr1[start:start + len(seq)] = list(seq)
            aseq = "".join(arr1)

            ori = "+"
            if r1.is_reverse:
                ori = "-"
            derep = r1.qname.split("=")[-1]
            rname = "{}:{}-{};size={};{}".format(*reg, derep, ori)
            clust.append("{}\n{}".format(rname, aseq))

    # store this cluster
    if clust:
        clusters.append("\n".join(clust))
        idx += 1

    # once 10000 clusters are stored, write them to disk
    if not idx % 10000:
        if clusters:
            out.write("\n//\n//\n".join(clusters) + "\n//\n//\n")
            clusters = []

# write final remaining clusters to disk
if clusters:
    out.write("\n//\n//\n".join(clusters) + "\n//\n//\n")
out.close()
True	False	CATGAG	CTACAG	Contig0	3481	3748
True	False	CATGAG	CTACAG	Contig0	3481	3748
False	True	TTCCAG	AGAATT	Contig0	3481	3748
False	True	TTCCAG	AGAATT	Contig0	3481	3748
True	False	CATGGT	GGACAC	Contig0	4940	5088
False	True	GAGGGA	GCAATT	Contig0	4940	5088
False	False	AATTGG	GGAGTT	Contig0	5481	5571
True	True	CATTTT	GGCATG	Contig0	5481	5571
False	False	AATTGC	GGGATT	Contig0	6267	6397
True	True	ATCCGG	CCCATG	Contig0	6267	6397
True	False	CATGAT	TGTGAA	Contig0	14907	15026
False	True	CGCCCC	AGAATT	Contig0	14907	15026
False	False	AATTCT	CTACAG	Contig0	16737	16933
True	True	GCAGGA	GGCATG	Contig0	16737	16933
True	False	CATGAA	CAGGCA	Contig0	19897	20031
False	True	CTGAGT	TAAATT	Contig0	19897	20031
True	False	CATGGA	TGTTGC	Contig0	23715	23823
False	True	CCCTTG	CAAATT	Contig0	23715	23823
False	False	AATTTA	ACTTCT	Contig0	24391	24679
False	False	AATTAT	AATGGA	Contig0	24391	24679
False	False	AATTTA	CTTGTC	Contig0	24391	24679
True	True	ATTGAT	GCCATG	Contig0	24391	24679
True	True	CGCTCA	GACATG	Contig0	24391	24679
True	True	CGCTCA	GACATG	Contig0	24391	24679
False	False	AATTGA	AGTCTC	Contig0	25290	25447
False	False	AATTGA	AGTCTC	Contig0	25290	25447
True	True	GATCGT	TCCATG	Contig0	25290	25447
True	True	GATTGT	TCCATG	Contig0	25290	25447
False	False	AATTTA	TCTTAA	Contig0	26192	26308
True	True	TATGTT	CACATG	Contig0	26192	26308
True	False	CATGTT	CATAGC	Contig0	28528	28669
False	True	CATAGC	AAAATT	Contig0	28528	28669
True	False	CATGCC	GTTTTA	Contig0	29084	29217
False	True	CGATCT	GCAATT	Contig0	29084	29217
True	False	CATGAG	AAAAAT	Contig0	29454	29541
False	True	GGGGTT	TCAATT	Contig0	29454	29541
False	False	AATTCC	CTTCCT	Contig0	30443	30713
True	True	TGGTAG	CACATG	Contig0	30443	30713
False	False	AATTAG	ATTAAT	Contig0	33785	34014
False	False	AATTAG	ATTAAT	Contig0	33785	34014
True	True	ATAGGC	TGCATG	Contig0	33785	34014
True	True	ATAGGC	TGCATG	Contig0	33785	34014
False	False	AATTTG	TTAGCC	Contig0	34452	34659
True	True	GGCAGC	CTCATG	Contig0	34452	34659
False	False	AATTTC	CGTAGA	Contig0	34452	34659
True	True	TCTAAT	AACATG	Contig0	34452	34659
False	False	AATTAA	TTTCTT	Contig0	37976	38247
False	False	AATTAA	TTTCTT	Contig0	37976	38247
True	True	GAGGCT	CCCATG	Contig0	37976	38247
True	True	GAGGCT	CCCATG	Contig0	37976	38247
True	False	CATGGA	CATTGC	Contig0	38736	38961
False	True	GGTCGG	GCAATT	Contig0	38736	38961
True	False	CATGCA	GTGGTA	Contig0	40396	40633
True	False	CATGCA	GTGGTA	Contig0	40396	40633
True	False	CATACA	GTGGCA	Contig0	40396	40633
False	True	GCTTGT	CAAATT	Contig0	40396	40633
False	True	GCTTGT	CAAATT	Contig0	40396	40633
False	True	GCTTGT	CAAATT	Contig0	40396	40633
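The cluster builder above leans on cigared() to project each read into reference coordinates before it is written into the array. A minimal stand-in, assuming pysam-style (op, length) cigar tuples and standard SAM op codes (hypothetical re-implementation; the real helper lives in ipyrad's assembly code):
In [ ]:
def cigared_sketch(seq, cigartuples):
    # project a read onto reference coordinates from its cigar
    out = []
    pos = 0
    for op, length in cigartuples:
        if op in (0, 7, 8):      # M, =, X: aligned bases, keep them
            out.append(seq[pos:pos + length])
            pos += length
        elif op in (1, 4):       # I, S: query-only bases, drop them
            pos += length
        elif op in (2, 3):       # D, N: reference-only span, pad with gaps
            out.append("-" * length)
    return "".join(out)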
In [16]:
## branch to reduce size
In [17]:
## run step 6 until building clusters and check orientations
In [ ]:
 
In [ ]:
self.remote_index_refs()
self.remote_run(
    function=concat_multiple_edits,
    printstr=("concatenating       ", "s3"),
    args=(),
)
self.remote_run(
    function=merge_end_to_end,
    printstr=("join unmerged pairs ", "s3"),
    args=(False, False,),
)
self.remote_run(
    function=dereplicate,
    printstr=("dereplicating       ", "s3"),
    args=(self.nthreads,),
    threaded=True,
)
self.remote_run(
    function=split_endtoend_reads,
    printstr=("splitting dereps    ", "s3"),
    args=(),
)
self.remote_run(
    function=mapping_reads,
    printstr=("mapping reads       ", "s3"),
    args=(self.nthreads,),
    threaded=True,
)
In [ ]:
 
In [16]:
data.run("123")
Assembly: tortas
[####################] 100% 0:16:40 | loading reads        | s1 |
[####################] 100% 0:00:05 | processing reads     | s2 |
  found an error in step2; see ipyrad_log.txt
  (identical step2 error repeated once per sample; duplicate lines omitted)
skipping samples not in state==2:
['AGO02concat', 'AGO08concat', 'AGO09concat', 'AGO11concat', 'AGO12concat', 'AGO14concat', 'AGO18concat', 'AGO20concat', 'AGO23concat', 'AGO32concat', 'CAZ05concat', 'CAZ06concat', 'CAZ07concat', 'CAZ11concat', 'CAZ13concat', 'CAZ14concat', 'CAZ15concat', 'CF01concat', 'CF03concat', 'CF05concat', 'CF09concat', 'CF15concat', 'CF16concat', 'CRU02concat', 'CRU12concat', 'CRU14concat', 'CRU23concat', 'CRU38concat', 'CRU41concat', 'CRU43concat', 'CRU54concat', 'CRU61concat', 'CRU64concat', 'ESP01concat', 'ESP02concat', 'ESP04concat', 'ESP08concat', 'ESP09concat', 'ESP10concat', 'ESP11concat', 'ESP12concat', 'ESP13concat', 'ESP14concat', 'LP01concat', 'LP02concat', 'LP06concat', 'LP08concat', 'LT02concat', 'LT05concat', 'LT06concat', 'LT11concat', 'PBL_H01concat', 'PBL_H126concat', 'PBL_H27concat', 'PBL_H60concat', 'PBL_H86concat', 'PBL_I143concat', 'PBL_I188concat', 'PBR_B125concat', 'PBR_B152concat', 'PBR_D49concat', 'PBR_E18concat', 'PBR_F89concat', 'PBR_G244concat', 'PZ03concat', 'PZ04concat', 'PZ05concat', 'PZ06concat', 'PZ08concat', 'PZ09concat', 'PZ28concat', 'PZ67concat', 'PZ70concat', 'PZ82concat', 'SCR02concat', 'SCR05concat', 'SCR06concat', 'SCR07concat', 'SCR08concat', 'SCR10concat', 'SCR21concat', 'SCR22concat', 'SCR24concat', 'VA05concat', 'VA07concat', 'VA08concat', 'VA1082concat', 'VA373concat', 'VA378concat', 'VA387concat', 'VA423concat', 'VA984concat', 'VD02concat', 'VD05concat', 'VD06concat', 'VD08concat', 'VD13concat', 'VD14concat', 'VD15concat', 'VD23concat', 'VD24concat', 'VD25concat']

  Encountered an unexpected error (see ./ipyrad_log.txt)
  Error message is below -------------------------------
no samples ready for step 3
In [13]:
step.remote_concat_bams()
In [14]:
step.remote_build_ref_regions()
In [27]:
self = step
In [29]:
regs = [next(self.regions) for i in range(10)]
regs
Out[29]:
[('Contig0', 76479, 77014),
 ('Contig0', 79123, 79378),
 ('Contig0', 81338, 81851),
 ('Contig0', 82205, 82795),
 ('Contig0', 83142, 83536),
 ('Contig0', 95644, 96211),
 ('Contig0', 101552, 101787),
 ('Contig0', 103468, 103708),
 ('Contig0', 107537, 108114),
 ('Contig0', 132703, 133571)]
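In the next cell get_ref_region() is called with region[1]+1 and region[2]+1 because bedtools reports 0-based half-open intervals while faidx-style region strings are 1-based inclusive. A sketch of what such a helper could look like (hypothetical re-implementation; the real one lives in ipyrad's assembly code):
In [ ]:
import subprocess as sps

def get_ref_region_sketch(reference, chrom, start1, end1):
    # start1/end1 are 1-based inclusive, hence the +1 shifts above
    cmd = [ip.bins.samtools, "faidx", reference,
           "{}:{}-{}".format(chrom, start1, end1)]
    out = sps.Popen(cmd, stdout=sps.PIPE).communicate()[0].decode()
    lines = out.strip().split("\n")
    return lines[0][1:], "".join(lines[1:])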
In [139]:
# access reads from bam file using pysam
bamfile = AlignmentFile(
    os.path.join(
        self.data.dirs.across,
        "cat.sorted.bam"),
    'rb')

# catcons output file for raw clusters and db samples
outfile = gzip.open(
    os.path.join(
        self.data.dirs.across,
        "{}.catcons.gz".format(self.data.name)),
    'wb')

# write header line to catcons with sample names
outfile.write(
    b" ".join([i.name.encode() for i in self.samples]) + b"\n")

# get clusters
lidx = 0
clusts = []
# while 1:

#     try:
#         region = next(self.regions)
#         reads = bamfile.fetch(*region)
#     except StopIteration:
#         break

for region in regs:
    reads = bamfile.fetch(*region)
 
    # get reference
    refn, refs = get_ref_region(
        data.paramsdict["reference_sequence"], 
        region[0], region[1]+1, region[2]+1)  
    
    # build cluster dict for sorting                
    rdict = {}
    for read in reads:
        print(read)
        rdict[read.qname] = read.seq   
    keys = sorted(rdict.keys(), key=lambda x: x.rsplit(":", 2)[0])
    
    # build cluster based on map positions (reads are already oriented)
    arr = np.zeros((len(rdict) + 1, len(refs)), dtype=bytes)
    arr[0] = list(refs)
    for idx, key in enumerate(keys):
        rstart, rstop = key.rsplit(":", 1)[-1].split("-")
        start = max(0, region[1] - (int(rstart) - 1))
        stop = min(len(refs), int(rstop) - int(rstart))
        #print(start, stop, len(rdict[key]), refn)
        #print(rdict[key])
        arr[idx + 1, int(start): int(stop)] = list(rdict[key])  
    arr[arr == b""] = b"N"
    for line in arr:
        # tobytes() joins the single-character elements into one bytestring
        outfile.write(line.tobytes() + b"\n")
    outfile.write(b"\n")

outfile.close()
AGO09concat_10:1:76480-76715	0	0	76479	0	235M	-1	-1	235	CATGTCTTATCTGGATCAGTCAACAGCATCTCTGCCTTTATCGGCCATCTATCTACAATATGTATCTAAGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTACACCTTCTTTTAAYAGATCCCTGGGGACCCAGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATTTCAG	None	[]
AGO08concat_10:1:76480-77015	0	0	76479	0	535M	-1	-1	535	CATGTCTTATCTGGATCAGTCAACAGCATCTCTGCCTTTATCGGCCATCTATCTACAATATGTATCTAAGCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTACACCTTCTTTTAACAGATCCCTGGGGACCCAGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGAGAATCTCTATAAGCACTATAGGGCTCATGACACACAGCTGTGTCTTTCTGATTCCATCCAATAGGGGGCAGCAGAGCATATAAACATTAACCAGTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATATGCAGTTTTTCTTGTTCCCCTCCAGAGGGGTCAGTAATAAAACAATGGATATCATGGGATTAGAGCTGAAATT	None	[]
AGO11concat_10:1:76480-76711	0	0	76479	0	231M	-1	-1	231	CATGTCTTATCTGGATCAGTCAACAGCATCTCTGCCTTTATCNGCCATCTATCTACAATATGTATCTAAGCANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTACACCTTCTTTTAACAGATCCCTGGGGACCCAGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATT	None	[]
AGO02concat_9:1:76480-76711	0	0	76479	0	231M	-1	-1	231	CATGTCTTATCTGGATCAGTCAACAGCATCTCTGCCTTTATCGGCCATCTATCTACAATATGTATCTAAGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTACACCTTCTTTTAAYAGATCCCTGGGGACCCAGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATT	None	[]
AGO09concat_11:1:79124-79379	0	0	79123	0	255M	-1	-1	255	CATGGACATAAGCCTTACGCCTCTCCTGGAAGTGGAGTTATGATGTCCGTGCAGTGGGGCGCTTACCTCGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGGTGCTGGNCAAATGGTCGAAGACTTGTTCCAACCTCTCCACTCTTGAGACCCAAACTTTGGGTCTGGTGGGAATT	None	[]
AGO09concat_12:1:81339-81852	0	0	81338	0	513M	-1	-1	513	CATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGGACNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTATTACACKTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGTTATAATTTGAAAAGTCAGGAAAAGAGCAGAGGCCGGACGTGGCTTCTCTCCCCTTGGCTTCACTGGGTTTTGATTGGCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCAGGACCTCGGTCACTCTGGGCCTCATTCTGCTCTCCGGGATGCTGGTTTAGTGCTGCTGGGACTCCATG	None	[]
AGO08concat_11:1:81339-81852	0	0	81338	0	513M	-1	-1	513	CATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGGACNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGYTATAATTTGAAAAGTCAGGAAAAGAGCAGAGGCCGGACGTGGCTTCTCTCCCCTTGGCTTCACTGGGTTTTGATTGGCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCAGGACCTCGGTCACTCTGGGCCTCATTCTGCTCTCCGGGATGCTGGTTTAGTGCTGCTGGGACTCCATG	None	[]
AGO11concat_11:1:81339-81852	0	0	81338	0	513M	-1	-1	513	CATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGGACNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGTTATAATTTGAAAAGTCAGGAAAAGAGCAGAGGCTGGACGTGGCTTCTCTCCCCTTGGCTTCACTGGGTTTTGATTGGCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCAGGACCTCGGTCACTCTGGGCCTCATTCTGCTCTCCGGGATGCTGGTTTAGTGCTGCTGGGACTCCATG	None	[]
AGO02concat_10:1:81339-81852	0	0	81338	0	513M	-1	-1	513	CATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGGACNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGTTATAATTTGAAAAGTCAGGAAAAGAGCAGAGGCNGGACGTGGCTTCTCTCCCCTTGGCTTCACTGGGTTTTGATTGGCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCAGGACCTCGGTCACTCTGGGCCTCATTCTGCTCTCCGGGATGCTGGTTTAGTGCTGCTGGGACTCCATG	None	[]
AGO08concat_12:1:82206-82699	0	0	82205	0	493M	-1	-1	493	CATGCAATGCAACAAGGTCTGGAGCTCCTTAGCGAGCCTTCGAGCCACCCAGTCCCTGAAATACACCCCCTGGTCCCTTTCAGGCTGGATCCAGAATGGAAAGTGTAACACCAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG	None	[]
AGO09concat_13:1:82469-82796	0	0	82468	0	327M	-1	-1	327	AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCTGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATGTGCTCATCCAGGCAGGAGCAAACCCTGAGCCCCGGCCACAGGTTCAAAGGTCTCAGCACCAGAGACAGGTGGAAAACCTTTTTCCCGTCCCGCAATT	None	[]
AGO11concat_12:1:82469-82699	0	0	82468	0	230M	-1	-1	230	AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG	None	[]
AGO02concat_11:1:82469-82699	0	0	82468	0	230M	-1	-1	230	AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCKCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG	None	[]
AGO09concat_14:1:83143-83367	0	0	83142	0	224M	-1	-1	224	AGAAATTCCATCCTGATCCTTCGAACCCAGCACGTTCCTACAACAGGTTTCRGTTTTGACGAACTGGTATTTGTTGCTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCACCCTACCAAGCACCCGAGGGTGTCAGTGGCTGCCCTGAAATGACGGGCACTGCTTGTTGCTGCCCCATG	None	[]
AGO11concat_13:1:83146-83367	0	0	83145	0	221M	-1	-1	221	AATTCCATCCTGATCCTTCGAACCCAGCACGTTCCTACAACAGGTTTCGGTTTTGACGAACTGGTATTTGTTGCTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCACCCTACCAAGCACCCGAGGGTGTCAGTGGCTGCCCTGAAATGACGGGCACTGCTTGTTGCTGCCCCATG	None	[]
AGO02concat_12:1:83146-83537	0	0	83145	0	391M	-1	-1	391	AATTCCATCCTGATCCTTCGAACCCAGCACGTTCCTACAACAGGTTTCRGTTTTGACGAACTGGTATTTGTTGCTGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCACCCTACCAAGCACCCGAGGGTGTCAGTGGCTGCCCTGAAATGACGGGCACTGCTTGTTGCTGCCCCATGGCCAGCAAGGCTGGTCCATAGGGGCTGCAGGGCACGGCATTATGTCCTCCGGGGAACTTCTGCAGCGNNNNNNNNNNNNNNNNNNNNNNNNNNNTATTGCCATCCCCGCCTCTGGAACGGGACATAAAACCGGCTAGCTGGCAGGATGTCCTGAGCGCCCTGGCAGAATT	None	[]
AGO09concat_15:1:95645-95887	0	0	95644	0	242M	-1	-1	242	AATTGAGGCAACTTTAACTGATATGTCCTGAAGTCACGAGGCCTTAATGGGCATCCTACGTGTAAAGGGCATCCTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAAAGTGATAAGAAGACTTTTTTATATACTGTGCAAGACTAACCTGTGCAACTCATTGCTACAGGATATCATG	None	[]
AGO08concat_13:1:95645-95887	0	0	95644	0	242M	-1	-1	242	AATTGAGGCAACTTTAACTGATATGTCCTGAAGTCACGAGGCCTTAATGGGCATCCTASGTGTAAAGGGCATCCTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAAGTGATAAGAAGACTTTTTTATATACTGTGCAAGACTAACCTGTGCAACTCATTGCTACAGGATATCATG	None	[]
AGO11concat_14:1:95645-96212	0	0	95644	0	567M	-1	-1	567	AATTGAGGCAACTTTAACTGATATGTCCTGAAGTCACGAGGCCTTAATGGGCATCCTACGTGTAAAGGGCATCCTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAAAGTGATAAGAAGACTTTTTTATATACTGTGCAAGACTAACCTGTGCAACTCATTGCTACAGGATATCATGGAGGTAAGGAGCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTAAACCTCACAACAGCCTTGAAATCTAGTTCCATAGTCTCGAGCCCTTTGTACAAATAGGTATCCAGAAGCACAGAAACCTCAAGTGACTTGTCTCAGGTCACACCCGTATGGCACTGATGTAAGTGAACATACAGCATG	None	[]
AGO02concat_13:1:95645-96212	0	0	95644	0	567M	-1	-1	567	AATTGAGGCAACTTTAACTGATATGTCCTGAAGTCACGAGGCCTTAATGGGCATCCTACGTGTAAAGGGCATCCTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAAGTGATAAGAAGACTTTTTTATATACTGTGCAAGACTAACCTGTGCAACTCATTGCTACAGGATATCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTAAACCTCACAACAGCCTTGAAATCTAGTTCCATCGTCTCGAGCCCTTTGTACAAATAGGTATCCAGAAGCACAGAAACCTCAAGTGACTTGTCTCAGGTCACACCCGTATGGCACTGATGTAAGTGAACATACAGCATG	None	[]
AGO09concat_16:1:101553-101788	0	0	101552	0	235M	-1	-1	235	CATGCATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAGGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGNCTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT	None	[]
AGO08concat_14:1:101553-101788	0	0	101552	0	235M	-1	-1	235	CATGCATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAGGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGACTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT	None	[]
AGO11concat_15:1:101553-101788	0	0	101552	0	235M	-1	-1	235	CATGCATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAGGCAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGACTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT	None	[]
AGO02concat_14:1:101553-101788	0	0	101552	0	235M	-1	-1	235	CATGCATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAGGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGACTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT	None	[]
AGO09concat_17:1:103469-103709	0	0	103468	0	240M	-1	-1	240	CATGGTCTACTAATCACAGAGATGCCACTGAGCTGGCCCTAGAACCAGGTGCAGCGTACGCAACTCTGTGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCCCTGGTTTGTCCCCAGGCAGGAGGCGAGTAGAGGCCTCCAGGTATCAGTTTCTATCTGTCGTGTTTAACAAATT	None	[]
AGO08concat_15:1:103469-103709	0	0	103468	0	240M	-1	-1	240	CATGGTCTACTAATCACAGAGATGCCACTGAGCTGGCCCTAGAACCAGGTGCAGCGTACGCAACTCTGTGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCCCTGGTTTGTCCCCAGGCAGGAGGCGAGTAGAGGCCTCCAGGTATCAGTTTCTATCTGTCGTGTTTAACAAATT	None	[]
AGO11concat_16:1:103469-103709	0	0	103468	0	240M	-1	-1	240	CATGGTCTACTAATCACAGAGATGCCACTGAGCTGGCCCTAGAACCAGGTGCAGCGTACGCAACTCTGTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCCCTGGTTTGTCCCCAGGCAGGAGGCGAGTAGAGGCCTCCAGGTATCAGTTTCTATCTGTCGTGTTTAACAAATT	None	[]
AGO02concat_15:1:103469-103709	0	0	103468	0	240M	-1	-1	240	CATGGTCTACTAATCACAGAGATGCCACTGAGCTGGCCCTAGAACCAGGTGCAGCGTACGCAACTCTGTGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCCCCTGGTTTGTCCCCAGGCAGGAGGCGAGTAGAGGCCTCCAGGTATCAGTTTCTATCTGTCGTGTTTAACAAATT	None	[]
AGO02concat_16:1:107538-108115	0	0	107537	0	577M	-1	-1	577	CATGTTTAATCCTGAGAAGAGAAGACCAAGGGGGGACCCAATAACAGTCTTCAAATCTGTTAAGGGCTGTTATAGACAGGACGGTGATCAATTNNNNNNCATGTCCACTGAGGGTAGGACAAGAAGTAACGGGCTTAATCTGCAGCAAGGGAGATACAGGGGAGATATTANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGAGGTTGTGGAATCCCCGTCACAGGAGGTTTTGAAGAACAAGTCAGACAAACGCCTGTCAGGGATGGTCTAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGACCTCACAGTCCCTTTCAGCCTGACACTTCTACGATTCTATGATGCACTGGAAGAGAACGGTCAGAAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGGGTAGAGCTTTTCTAGTTTGGCTGTGCCAGCAAACTTTCCCAGTTAGAGCTGCAAGAGGCAGACTGAAAAATT	None	[]
AGO09concat_18:1:107637-108115	0	0	107636	0	478M	-1	-1	478	CATGTCCACTGAGGGTAGGACAAGAAGTAACGGGCTTAATCTGCAGCAAGGGAGATACAGGGTAGATATTANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGAGGTTGTGGAATCCCCGTCACAGGAGGTTTTGAAGAACAAGTCAGACAAACGCCTGTCAGGGATGGTCTAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGACCTCACAGTCCCTTTCAGCCTGACACTTCTACGATTCTATGATGCACTGGAAGAGAACGGTCAGAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGGGTAGAGCTTTTCTAGTTTGGCTGTGCCAGCAAACTTTCCCAGTTAGAGCTGCAAGAGGCAGACTGAAAAATT	None	[]
AGO08concat_16:1:107637-108115	0	0	107636	0	478M	-1	-1	478	CATGTCCACTGAGGGTAGGACAAGAAGTAACGGGCTTAATCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGAGGTTGTGGAATCCCCGTCACAGGAGGTTTTGAAGAACAAGTCAGACAAACGCCTGTCAGGGATGGTCTAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGACCTCACAGTCCCTTTCAGCCTGACACTTCTACGATTCTATGATGCACTGGAAGAGAACGGTCAGAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGGGTAGAGCTTTTCTAGTTTGGCTGTGCCAGCAAACTTTCCCAGTTAGAGCTGCAAGAGGCAGACTGAAAAATT	None	[]
AGO11concat_17:1:107637-108115	0	0	107636	0	478M	-1	-1	478	CATGTCCACTGAGGGTAGGACAAGAAGTAACGGGCTTAATCTGCAGCAAGGGAGATACAGGGTAGATATTANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGAGGTTGTGGAATCCCCGTCACAGGAGGTTTTGAAGAACAAGTCAGACAAACGCCTGTCAGGGATGGTCTAAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGACCTCACAGTCCCTTTCAGCCTGACACTTCTACGATTCTATGATGCACTGGAAGAGAACGGTCAGAAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAGGGGTAGAGCTTTTCTAGTTTGGCTGTGCCAGCAAACTTTCCCAGTTAGAGCTGCAAGAGGCAGACTGAAAAATT	None	[]
AGO09concat_19:1:132704-133572	0	0	132703	0	868M	-1	-1	868	CATGTATTAAACTTGAACACCTGCCCTTGCACTAGTAGGATTTTATGTTTGTATTTTGCATAACTTAGAATAAATAATAATGATATGATTTAACCGTATCCTGCCAGCCACCTTGGTAGAAATGCATCAGCCAGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTAATTCTGTTTTGTGTGNGAATAAAAAATGTGTGTGTTAAAAGAGAAATGTAAAGGCCATCACTGAACCTGATGGTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATTGACGACTCTAAAATATAAGACAGGCCCGTATCAATGAGGACAAAATAGCNGTAATCAAAACCAGCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGAAACTTTGGGTTCATCCTGCCAAGACTTCCCCAGAGCGTAGTGTCGTGATCGACAGAACCCAGCTCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGGACTCTGGACTAGTAACTATAACATCGACTGGTGGGAGAGTGTGCGTGTGAGTGATTGAATGAATGTGCTAATT	None	[]
AGO02concat_17:1:132704-133221	0	0	132703	0	517M	-1	-1	517	CATGTATTAAACTTGAACACCTGCCCTTGCACTAGTAGGATTTTATGTTTGTATTTTGCATAACTTAGAATAAATAATAATGATATGATTTAACCGTATCCTGCCAGCCACCTTGGTAGAAATGCATCAGCCAGAATTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAATTAATTCTGTTTTGTGTGTGAATAAAAAATGTGTGTGTTAAAAGAGAAATGTAAAGGCCATCACTGAACCTGATGGTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATTGACGACTCTAAAATATAAGACAGGCCCGTATCAATGAGGACAAAATAGCTGTAATCAAAACCAGCATG	None	[]
AGO11concat_18:1:132993-133572	0	0	132992	0	579M	-1	-1	579	AATTAATTCTGTTTTGTGTGTGAATAAAAAATGTGTGTGTTAAAAGAGAAATGTAAAGGCCATCACTGAACCTGATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATTGACGACTCTAAAATATAAGACAGGCCCGTATCAATGAGGACAAAATAGCTGTAATCAAAACCAGCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGAAACTTTGGGTTCATCCTGCCAAGACTTCCCCAGAGCGTAGTGTCGTGATCGACAGAACCCAGCTCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGGACTCTGGACTAGTAACTATAACATCGACTGGTGGGAGAGTGTGCGTGTGAGTGATTGAATGAATGTGCTAATT	None	[]
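Each read name above encodes its 1-based mapping coordinates after the final colon, e.g. AGO02concat_11:1:82469-82699 maps at 82469-82699. Its 0-based offset inside the window ('Contig0', 82205, 82795) is therefore (82469 - 1) - 82205 = 263, which is the column where the read must be placed in the window-width array; placing every read at column 0 scrambles loci whose reads start after the region start.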
In [60]:
region[1]
Out[60]:
132703
In [141]:
revcomp("AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCKCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN")
Out[141]:
'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGCTTGTATTAGAGATGGGGCTGAAGTCAAACCCTGACCTGAACAGTGAGAAAATCTGGTGCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTCCTCTGCCTCCCTCAAGAAGAAATCAAGAGKGGAAAGGAGCAGGGCGGGAGGTATGGGAAAGGAAGAATGGAATT'
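Note that the K in the input passes through as K in the reverse complement above, so this revcomp evidently complements only A/C/G/T and leaves ambiguity codes untouched. A minimal sketch consistent with that observed behavior (not necessarily ipyrad's exact util):

# sketch of a revcomp matching the output above: only A/C/G/T are
# complemented; ambiguity codes (e.g. K) and N pass through unchanged
def revcomp(seq):
    return (seq[::-1]
        .replace("A", "t").replace("T", "a")
        .replace("C", "g").replace("G", "c")
        .upper())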
In [37]:
    # continuation of the per-region loop: get consens seq and variant sites
    clust = []
    avars = refvars(arr.view(np.uint8), PSEUDO_REF)
    dat = b"".join(avars.view("S1")[:, 0]).decode()
    snpstring = "".join(["*" if i else " " for i in avars[:, 1]])
    clust.append("ref_{}:{}-{}\n{}".format(*region, dat))

    # append each sample read beneath the reference, then the SNP string;
    # loci could eventually be written with stats [locids, nsnps, npis, nindels]
    for key in keys:
        clust.append("{}\n{}".format(key, rdict[key]))
    clust.append("SNPs\n" + snpstring)
    clusts.append("\n".join(clust))

    # advance locus counter
    lidx += 1

    # write chunks to file
    if not lidx % 1000:
        outfile.write(
            str.encode("\n//\n//\n".join(clusts) + "\n//\n//\n"))
        clusts = []

# write remaining
if clusts:                
    outfile.write(
        str.encode("\n//\n//\n".join(clusts) + "\n//\n//\n"))
outfile.close()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-37-7858eebe5047> in <module>()
     43     arr = np.zeros((len(rdict), len(refs)), dtype=bytes)
     44     for idx, key in enumerate(keys):
---> 45         arr[idx] = list(rdict[key])
     46 
     47     # get consens seq and variant site index

ValueError: cannot copy sequence with size 231 to array axis with dimension 535
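The ValueError arises because reads spanning a region have different lengths, while arr[idx] = list(rdict[key]) requires each read to exactly fill a row of width len(refs). A hedged sketch of the fix, pre-filling a window-width array with N and writing each read at its mapped offset (variable names follow the cell above):

# place each read at its mapped offset in a window-width array instead
# of assigning it to a full row; rows are pre-filled with N
arr = np.full((len(rdict), len(refs)), b"N", dtype="S1")
for idx, key in enumerate(keys):
    rstart, _ = key.rsplit(":", 1)[-1].split("-")
    start = max(0, (int(rstart) - 1) - region[1])
    seq = list(rdict[key])[:len(refs) - start]
    arr[idx, start:start + len(seq)] = seq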
In [16]:
step.remote_build_ref_clusters()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-16-2a0e5422841e> in <module>()
----> 1 step.remote_build_ref_clusters()

~/Documents/ipyrad/ipyrad/assemble/cluster_across.py in remote_build_ref_clusters(self)
    588             arr = np.zeros((len(rdict), len(read.seq)), dtype=bytes)
    589             for idx, key in enumerate(keys):
--> 590                 arr[idx] = list(rdict[key])
    591 
    592             # get consens seq and variant site index

ValueError: cannot copy sequence with size 543 to array axis with dimension 559
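Both tracebacks stem from the same bug: remote_build_ref_clusters also assigns variable-length reads directly into fixed-width rows, with the width taken from one read's len(read.seq) rather than from the window, so the same offset-and-pad fix sketched above applies there as well.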
In [4]:
from ipyrad.assemble.clustmap import *
In [6]:
self = Step3(data, 8, True, ipyclient)
In [7]:
self.run()
[####################] 100% 0:00:00 | indexing reference   | s3 |
[####################] 100% 0:00:00 | concatenating        | s3 |
[####################] 100% 0:05:42 | join unmerged pairs  | s3 |
[####################] 100% 0:02:17 | dereplicating        | s3 |
[####################] 100% 0:00:39 | splitting dereps     | s3 |
[####################] 100% 1:53:03 | mapping reads        | s3 |
[####################] 100% 1:40:12 | building clusters    | s3 |
[####################] 100% 0:00:24 | calc cluster stats   | s3 |
In [8]:
self.data.run("45")
Assembly: 5-tortas
[####################] 100% 0:15:41 | inferring [H, E]     | s4 |
[####################] 100% 0:00:26 | calculating depths   | s5 |
[####################] 100% 0:00:45 | chunking clusters    | s5 |
[####################] 100% 16:10:59 | consens calling      | s5 |
[####################] 100% 0:01:31 | indexing alleles     | s5 |
In [13]:
self.remote_index_refs()
self.remote_run(
    function=concat_multiple_edits,
    printstr=("concatenating       ", "s3"),
    args=(),
)
self.remote_run(
    function=merge_end_to_end,
    printstr=("join unmerged pairs ", "s3"),
    args=(False, False,),
)
[####################] 100% 0:00:00 | indexing reference   | s3 |
[####################] 100% 0:00:00 | concatenating        | s3 |
[####################] 100% 0:05:49 | join unmerged pairs  | s3 |
In [15]:
self.remote_run(
    function=dereplicate,
    printstr=("dereplicating       ", "s3"),
    args=(self.nthreads,),
    threaded=True,
)
[####################] 100% 0:02:22 | dereplicating        | s3 |
In [16]:
self.remote_run(
    function=split_endtoend_reads,
    printstr=("splitting dereps    ", "s3"),
    args=(),
)
[####################] 100% 0:00:36 | splitting dereps     | s3 |
In [ ]:
self.remote_run(
    function=mapping_reads,
    printstr=("mapping reads       ", "s3"),
    args=(self.nthreads,),
    threaded=True,
)
[##########          ]  50% 1:18:01 | mapping reads        | s3 |
In [ ]:
sample = list(self.data.samples.values())[0]
merge_end_to_end(self.data, sample, True, True)
In [ ]:
 
In [15]:
sample = list(data.samples.values())[0]

infiles = [
    os.path.join(data.dirs.edits, "{}.trimmed_R1_.fastq.gz".format(sample.name)),
    os.path.join(data.dirs.edits, "{}_R1_concatedit.fq.gz".format(sample.name)),
    os.path.join(data.tmpdir, "{}_merged.fastq".format(sample.name)),
    os.path.join(data.tmpdir, "{}_declone.fastq".format(sample.name)),
]
# keep only the files that exist and take the most fully processed one
infiles = [i for i in infiles if os.path.exists(i)]
infile = infiles[-1]

infile
Out[15]:
'/home/deren/Documents/ipyrad/tests/tortas/5-tortas-tmpalign/AGO02concat_merged.fastq'
In [19]:
strand = "plus"
if data.paramsdict["datatype"] is ('gbs' or '2brad'):
    strand = "both"
nthreads=2

cmd = [
    ip.bins.vsearch,
    "--derep_fulllength", infile,
    "--strand", strand,
    "--output", os.path.join(data.tmpdir, sample.name + "_derep.fastq"),
    "--threads", str(nthreads),
    "--fasta_width", str(0),
    "--fastq_qmax", "1000",
    "--sizeout", 
    "--relabel_md5",
]
cmd
Out[19]:
['/home/deren/Documents/ipyrad/bin/vsearch-linux-x86_64',
 '--derep_fulllength',
 '/home/deren/Documents/ipyrad/tests/tortas/5-tortas-tmpalign/AGO02concat_merged.fastq',
 '--strand',
 'plus',
 '--output',
 '/home/deren/Documents/ipyrad/tests/tortas/5-tortas-tmpalign/AGO02concat_derep.fastq',
 '--threads',
 '2',
 '--fasta_width',
 '0',
 '--fastq_qmax',
 '1000',
 '--sizeout',
 '--relabel_md5']
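For reference: --fastq_qmax 1000 raises vsearch's default quality-score ceiling (41), which joined or merged reads can exceed; --sizeout appends the replication count (;size=N) to each dereplicated record; and --relabel_md5 renames each record with the md5 hash of its sequence, giving identical reads identical, order-independent names.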
In [20]:
# run vsearch; stderr is folded into stdout, so errmsg holds all output
proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE, close_fds=True)
errmsg = proc.communicate()[0]
if proc.returncode:
    ip.logger.error("error inside dereplicate %s", errmsg)
    raise IPyradWarningExit(errmsg)
In [10]:
s.remote_run(
    function=dereplicate,
    printstr=("dereplicating       ", "s3"),
    args=(s.nthreads,),
    threaded=True,
)
[                    ]   0% 0:01:55 | dereplicating        | s3 |
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-10-9074a628ea2e> in <module>()
      3     printstr=("dereplicating       ", "s3"),
      4     args=(s.nthreads,),
----> 5     threaded=True,
      6 )

~/Documents/ipyrad/ipyrad/assemble/clustmap.py in remote_run(self, printstr, function, args, threaded)
    598             ready = [rasyncs[i].ready() for i in rasyncs]
    599             self.data._progressbar(len(ready), sum(ready), start, printstr)
--> 600             time.sleep(0.1)
    601             if len(ready) == sum(ready):
    602                 break

KeyboardInterrupt: