In [6]:
import ipyrad as ip
import ipyparallel as ipp
In [7]:
ipyclient = ipp.Client()
/home/deren/miniconda3/lib/python3.6/site-packages/ipyparallel/client/client.py:458: RuntimeWarning: 
            Controller appears to be listening on localhost, but not on this machine.
            If this is true, you should specify Client(...,sshserver='[email protected]')
            or instruct your controller to listen on an external IP.
  RuntimeWarning)
In [3]:
data = ip.Assembly("1-pairtest")
data.set_params("raw_fastq_path", "ipsimdata/pairddrad_example_R*_.fastq.gz")
data.set_params("barcodes_path", "ipsimdata/pairddrad_example_barcodes.txt")
data.set_params("datatype", "pairddrad")
data.set_params("restriction_overhang", ("TGCAG", "CGG"))
New Assembly: 1-pairtest
In [3]:
data = ip.Assembly("2-setest")
data.set_params("raw_fastq_path", "ipsimdata/rad_example_R1_.fastq.gz")
data.set_params("barcodes_path", "ipsimdata/rad_example_barcodes.txt")
data.set_params("datatype", "rad")
data.set_params("restriction_overhang", ("TGCAG", ""))
New Assembly: 2-setest
In [3]:
data = ip.Assembly("3-refsetest")
data.set_params("raw_fastq_path", "ipsimdata/rad_example_R1_.fastq.gz")
data.set_params("barcodes_path", "ipsimdata/rad_example_barcodes.txt")
data.set_params("datatype", "rad")
data.set_params("assembly_method", "reference")
data.set_params("reference_sequence", "ipsimdata/rad_example_genome.fa")
data.set_params("restriction_overhang", ("TGCAG", ""))
#data.get_params()
New Assembly: 3-refsetest
In [5]:
data = ip.Assembly("4-refpairtest")
data.set_params("raw_fastq_path", "ipsimdata/pairddrad_example_R*_.fastq.gz")
data.set_params("barcodes_path", "ipsimdata/rad_example_barcodes.txt")
data.set_params("datatype", "pairddrad")
data.set_params("assembly_method", "reference")
data.set_params("reference_sequence", "ipsimdata/pairddrad_example_genome.fa")
data.set_params("restriction_overhang", ("TGCAG", "CGG"))
#data.get_params()
New Assembly: 4-refpairtest
In [8]:
data = ip.Assembly("5-tortas")
data.set_params("project_dir", "tortas")
data.set_params("sorted_fastq_path", "/home/deren/Dropbox/Maud/fastq-concats/*.gz")
data.set_params("datatype", "pairddrad")
data.set_params("assembly_method", "reference")
data.set_params("reference_sequence", "/home/deren/Dropbox/Maud/lgeorge.genome.fa")
data.set_params("restriction_overhang", ("CATG", "AATT"))
data.set_params("filter_adapters", 2)
New Assembly: 5-tortas
In [4]:
data.run("12", force=True)
Assembly: 4-refpairtest
[force] overwriting fastq files previously created by ipyrad.
This _does not_ affect your original/raw data files.
[####################] 100% 0:00:04 | sorting reads        | s1 |
[####################] 100% 0:00:02 | writing/compressing  | s1 |
[####################] 100% 0:00:04 | processing reads     | s2 |
In [11]:
#data.run("12", force=True)
#data = ip.load_json("1-pairtest.json")
#data = ip.load_json("2-setest.json")
#data = ip.load_json("3-refsetest.json")
#data = ip.load_json("4-refpairtest.json")
data = ip.load_json("tortas/5-tortas.json")
loading Assembly: 5-tortas
from saved path: ~/Documents/ipyrad/tests/tortas/5-tortas.json
In [12]:
 
Out[12]:
state reads_raw reads_passed_filter
AGO02concat 2 11050294 10800672
AGO08concat 2 13408401 13030329
AGO09concat 2 15650127 15121047
AGO11concat 2 12848936 12370018
In [9]:
from ipyrad.assemble.cluster_across import *
In [10]:
from ipyrad.assemble.clustmap import *
s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient)
samples = list(s3.data.samples.values())
sample = samples[1]
In [7]:
s3.data.ipcluster["threads"] = 4
s3.run()
[####################] 100% 0:00:01 | concatenating        | s3 |
[####################] 100% 0:00:01 | join unmerged pairs  | s3 |
[####################] 100% 0:00:00 | dereplicating        | s3 |
[####################] 100% 0:00:00 | splitting dereps     | s3 |
[####################] 100% 0:00:02 | mapping reads        | s3 |
[####################] 100% 0:00:10 | building clusters    | s3 |
In [11]:
s3.data.run("45")
Assembly: 4-refpairtest
[####################] 100% 0:00:04 | inferring [H, E]     | s4 |
[####################] 100% 0:00:00 | calculating depths   | s5 |
[####################] 100% 0:00:00 | chunking clusters    | s5 |
[####################] 100% 0:00:25 | consens calling      | s5 |
In [16]:
#s3.data.stats
data = s3.data
jobid = 0
samples = list(data.samples.values())[:4]
randomseed = 123
In [ ]:
 
In [19]:
conshandles = [
    sample.files.consens[0] for sample in samples if 
    sample.stats.reads_consens]
conshandles.sort()
assert conshandles, "no consensus files found"
conshandles
Out[19]:
['/home/deren/Documents/ipyrad/tests/4-refpairtest_consens/1B_0.consens.gz',
 '/home/deren/Documents/ipyrad/tests/4-refpairtest_consens/1D_0.consens.gz',
 '/home/deren/Documents/ipyrad/tests/4-refpairtest_consens/2H_0.consens.gz',
 '/home/deren/Documents/ipyrad/tests/4-refpairtest_consens/3J_0.consens.gz']
In [20]:
## concatenate all of the gzipped consens files
cmd = ['cat'] + conshandles
groupcons = os.path.join(
    data.dirs.across, 
    "{}-{}-catcons.gz".format(data.name, jobid))
LOGGER.debug(" ".join(cmd))
with open(groupcons, 'w') as output:
    call = sps.Popen(cmd, stdout=output, close_fds=True)
    call.communicate()

## a string of sed substitutions for temporarily replacing hetero sites
## skips lines with '>', so it doesn't affect taxon names
subs = ["/>/!s/W/A/g", "/>/!s/w/A/g", "/>/!s/R/A/g", "/>/!s/r/A/g",
        "/>/!s/M/A/g", "/>/!s/m/A/g", "/>/!s/K/T/g", "/>/!s/k/T/g",
        "/>/!s/S/C/g", "/>/!s/s/C/g", "/>/!s/Y/C/g", "/>/!s/y/C/g"]
subs = ";".join(subs)
In [27]:
## pipe passed data from gunzip to sed.
cmd1 = ["gunzip", "-c", groupcons]
cmd2 = ["sed", subs]
LOGGER.debug(" ".join(cmd1))
LOGGER.debug(" ".join(cmd2))

proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)
allhaps = groupcons.replace("-catcons.gz", "-cathaps.gz")
with open(allhaps, 'w') as output:
    proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=output, close_fds=True)
    proc2.communicate()
proc1.stdout.close()
In [30]:
data.dirs.across = os.path.join(data.name + "_across")
if not os.path.exists(data.dirs.across):
    os.makedirs(data.dirs.across)
    
import ipyrad
In [31]:
conshandles = [
    sample.files.consens[0] for sample in samples if 
    sample.stats.reads_consens]
conshandles.sort()
assert conshandles, "no consensus files found"

## concatenate all of the gzipped consens files
cmd = ['cat'] + conshandles
groupcons = os.path.join(
    data.dirs.across, 
    "{}-{}-catcons.gz".format(data.name, jobid))
LOGGER.debug(" ".join(cmd))
with open(groupcons, 'w') as output:
    call = sps.Popen(cmd, stdout=output, close_fds=True)
    call.communicate()

## a string of sed substitutions for temporarily replacing hetero sites
## skips lines with '>', so it doesn't affect taxon names
subs = ["/>/!s/W/A/g", "/>/!s/w/A/g", "/>/!s/R/A/g", "/>/!s/r/A/g",
        "/>/!s/M/A/g", "/>/!s/m/A/g", "/>/!s/K/T/g", "/>/!s/k/T/g",
        "/>/!s/S/C/g", "/>/!s/s/C/g", "/>/!s/Y/C/g", "/>/!s/y/C/g"]
subs = ";".join(subs)

## impute pseudo-haplo information to avoid mismatch at hetero sites
## the read data with hetero sites is put back into clustered data later.
## pipe passed data from gunzip to sed.
cmd1 = ["gunzip", "-c", groupcons]
cmd2 = ["sed", subs]
LOGGER.debug(" ".join(cmd1))
LOGGER.debug(" ".join(cmd2))

proc1 = sps.Popen(cmd1, stdout=sps.PIPE, close_fds=True)
allhaps = groupcons.replace("-catcons.gz", "-cathaps.gz")
with open(allhaps, 'w') as output:
    proc2 = sps.Popen(cmd2, stdin=proc1.stdout, stdout=output, close_fds=True)
    proc2.communicate()
proc1.stdout.close()

## now sort the file using vsearch
allsort = groupcons.replace("-catcons.gz", "-catsort.fa")
cmd1 = [ipyrad.bins.vsearch,
        "--sortbylength", allhaps,
        "--fasta_width", "0",
        "--output", allsort]
LOGGER.debug(" ".join(cmd1))
proc1 = sps.Popen(cmd1, close_fds=True)
proc1.communicate()
Out[31]:
(None, None)
In [34]:
from ipyrad.assemble.cluster_across import *

random.seed(randomseed)

## open an iterator to lengthsorted file and grab two lines at at time
allshuf = groupcons.replace("-catcons.gz", "-catshuf.fa")
outdat = open(allshuf, 'wt')
indat = open(allsort, 'r')
idat = izip(iter(indat), iter(indat))
done = 0

chunk = [next(idat)]
while not done:
    ## grab 2-lines until they become shorter (unless there's only one)
    oldlen = len(chunk[-1][-1])
    while 1:
        try:
            dat = next(idat)
        except StopIteration:
            done = 1
            break
        if len(dat[-1]) == oldlen:
            chunk.append(dat)
        else:
            ## send the last chunk off to be processed
            random.shuffle(chunk)
            outdat.write("".join(chain(*chunk)))
            ## start new chunk
            chunk = [dat]
            break

## do the last chunk
random.shuffle(chunk)
outdat.write("".join(chain(*chunk)))

indat.close()
outdat.close()
In [50]:
def build_clusters_from_cigars(data, sample):
    
    # get all regions with reads. Generator to yield (str, int, int)
    fullregions = bedtools_merge(data, sample).strip().split("\n")    
    regions = (i.split("\t") for i in fullregions)
    regions = ((i, int(j), int(k)) for (i, j, k) in regions)

    # access reads from bam file using pysam
    bamfile = AlignmentFile(
        os.path.join(data.dirs.refmapping, 
        "{}-mapped-sorted.bam".format(sample.name)),
        'rb')

    # iterate over all regions
    opath = os.path.join(
        data.dirs.clusts, "{}.clustS.gz".format(sample.name))
    out = gzip.open(opath, 'wt')
    idx = 0
    clusters = []
    for reg in regions:
        # uncomment and compare against ref sequence when testing
        #ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
        reads = bamfile.fetch(*reg)

        # store reads in a dict
        rdict = {}

        # paired-end data cluster building
        if "pair" in data.paramsdict["datatype"]:
            
            # match paired reads together in a dictionary            
            for read in reads:
                if read.qname not in rdict:
                    rdict[read.qname] = [read, None]
                else:
                    rdict[read.qname][1] = read

            # sort keys by derep number
            keys = sorted(
                rdict.keys(),
                key=lambda x: int(x.split("=")[-1]), reverse=True)

            # build the cluster based on map positions, orientation, cigar
            clust = []
            for key in keys:
                r1, r2 = rdict[key]
                if r1 and r2:

                    #lref = len(ref[1])
                    lref = reg[2] - reg[1]
                    arr1 = np.zeros(lref, dtype="U1")
                    arr2 = np.zeros(lref, dtype="U1")
                    arr1.fill("-")
                    arr2.fill("-")

                    # how far ahead of the start does this read begin
                    seq = cigared(r1.seq, r1.cigar)
                    start = r1.reference_start - reg[1] 
                    arr1[start:start + len(seq)] = list(seq)
                    
                    seq = cigared(r2.seq, r2.cigar)
                    start = r2.reference_start - reg[1] 
                    arr2[start:start + len(seq)] = list(seq)
                    
                    arr3 = join_arrays(arr1, arr2)
                    pairseq = "".join(arr3)

                    ori = "+"
                    if r1.is_reverse:
                        ori = "-"
                    derep = r1.qname.split("=")[-1]
                    rname = "{}:{}-{};size={};{}".format(*reg, derep, ori)
                    clust.append("{}\n{}".format(rname, pairseq))

        # single-end data cluster building
        else:   
            for read in reads:
                rdict[read.qname] = read

            # sort keys by derep number
            keys = sorted(
                rdict.keys(),
                key=lambda x: int(x.split("=")[-1]), reverse=True)

            # build the cluster based on map positions, orientation, cigar
            clust = []
            for key in keys:
                r1 = rdict[key]

                #aref = np.array(list(ref[1]))
                lref = reg[2] - reg[1]
                arr1 = np.zeros(lref, dtype="U1")
                arr1.fill("-")

                # how far ahead of the start does this read begin
                seq = cigared(r1.seq, r1.cigar)
                start = r1.reference_start - reg[1] 
                arr1[start:start + len(seq)] = list(seq)
                aseq = "".join(arr1)

                ori = "+"
                if r1.is_reverse:
                    ori = "-"
                derep = r1.qname.split("=")[-1]
                rname = "{}:{}-{};size={};{}".format(*reg, derep, ori)
                clust.append("{}\n{}".format(rname, aseq))

        # store this cluster
        clusters.append("\n".join(clust))
        idx += 1

        # if 1000 clusters stored then write to disk
        if not idx % 10:
            out.write("\n//\n//\n".join(clusters) + "\n//\n//\n")
            clusters = []
 
    # write final remaining clusters to disk
    out.write("\n//\n//\n".join(clusters) + "\n//\n//\n")
    out.close()
In [51]:
build_clusters_from_cigars(data, sample)
In [1]:
#maxlens, depths = get_quick_depths(data, sample)
In [2]:
#sample_cleanup(data, sample)
In [18]:
def get_quick_depths(data, sample):
    """ iterate over clustS files to get data """

    ## use existing sample cluster path if it exists, since this
    ## func can be used in step 4 and that can occur after merging
    ## assemblies after step3, and if we then referenced by data.dirs.clusts
    ## the path would be broken.
    if sample.files.clusters:
        pass
    else:
        ## set cluster file handles
        sample.files.clusters = os.path.join(
            data.dirs.clusts, sample.name + ".clustS.gz")

    ## get new clustered loci
    fclust = data.samples[sample.name].files.clusters
    clusters = gzip.open(fclust, 'rt')
    pairdealer = izip(*[iter(clusters)] * 2)

    ## storage
    depths = []
    maxlen = []

    ## start with cluster 0
    tdepth = 0
    tlen = 0

    ## iterate until empty
    while 1:
        ## grab next
        try:
            name, seq = next(pairdealer)
        except StopIteration:
            break

        ## if not the end of a cluster
        #print name.strip(), seq.strip()
        if name.strip() == seq.strip():
            depths.append(tdepth)
            maxlen.append(tlen)
            tlen = 0
            tdepth = 0

        else:
            try:
                tdepth += int(name.strip().split("=")[-1][:-2])
                tlen = len(seq)
            except:
                print(name)

    ## return
    clusters.close()
    return np.array(maxlen), np.array(depths)
In [14]:
def sample_cleanup(data, sample):
    """ stats, cleanup, and link to samples """

    ## get maxlen and depths array from clusters
    maxlens, depths = get_quick_depths(data, sample)

    ## Test if depths is non-empty, but just full of zeros.
    if not depths.max():
        print("    no clusters found for {}".format(sample.name))
        return 

    else:
        ## store which min was used to calculate hidepth here
        sample.stats_dfs.s3["hidepth_min"] = (
            data.paramsdict["mindepth_majrule"])

        # If our longest sequence is longer than the current max_fragment_len
        # then update max_fragment_length. For assurance we require that
        # max len is 4 greater than maxlen, to allow for pair separators.
        hidepths = depths >= data.paramsdict["mindepth_majrule"]
        maxlens = maxlens[hidepths]

        ## Handle the case where there are no hidepth clusters
        if maxlens.any():
            maxlen = int(maxlens.mean() + (2. * maxlens.std()))
        else:
            maxlen = 0
        if maxlen > data._hackersonly["max_fragment_length"]:
            data._hackersonly["max_fragment_length"] = maxlen + 4

        ## make sense of stats
        keepmj = depths[depths >= data.paramsdict["mindepth_majrule"]]
        keepstat = depths[depths >= data.paramsdict["mindepth_statistical"]]

        ## sample summary stat assignments
        sample.stats["state"] = 3
        sample.stats["clusters_total"] = depths.shape[0]
        sample.stats["clusters_hidepth"] = keepmj.shape[0]

        ## store depths histogram as a dict. Limit to first 25 bins
        bars, bins = np.histogram(depths, bins=range(1, 26))
        sample.depths = {int(i): int(v) for i, v in zip(bins, bars) if v}

        ## sample stat assignments
        ## Trap numpy warnings ("mean of empty slice") printed by samples
        ## with few reads.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            sample.stats_dfs.s3["merged_pairs"] = sample.stats.reads_merged
            sample.stats_dfs.s3["clusters_total"] = depths.shape[0]
            try:
                sample.stats_dfs.s3["clusters_hidepth"] = (
                    int(sample.stats["clusters_hidepth"]))
            except ValueError:
                ## Handle clusters_hidepth == NaN
                sample.stats_dfs.s3["clusters_hidepth"] = 0
            sample.stats_dfs.s3["avg_depth_total"] = depths.mean()
            sample.stats_dfs.s3["avg_depth_mj"] = keepmj.mean()
            sample.stats_dfs.s3["avg_depth_stat"] = keepstat.mean()
            sample.stats_dfs.s3["sd_depth_total"] = depths.std()
            sample.stats_dfs.s3["sd_depth_mj"] = keepmj.std()
            sample.stats_dfs.s3["sd_depth_stat"] = keepstat.std()

    ## Get some stats from the bam files
    ## This is moderately hackish. samtools flagstat returns
    ## the number of reads in the bam file as the first element
    ## of the first line, this call makes this assumption.
    if not data.paramsdict["assembly_method"] == "denovo":
        ## shorter names
        mapf = os.path.join(
            data.dirs.refmapping, sample.name + "-mapped-sorted.bam")
        umapf = os.path.join(
            data.dirs.refmapping, sample.name + "-unmapped.bam")

        ## get from unmapped
        cmd1 = [ip.bins.samtools, "flagstat", umapf]
        proc1 = sps.Popen(cmd1, stderr=sps.STDOUT, stdout=sps.PIPE)
        result1 = proc1.communicate()[0]

        ## get from mapped
        cmd2 = [ip.bins.samtools, "flagstat", mapf]
        proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE)
        result2 = proc2.communicate()[0]

        ## store results
        ## If PE, samtools reports the _actual_ number of reads mapped, both 
        ## R1 and R2, so here if PE divide the results by 2 to stay consistent
        ## with how we've been reporting R1 and R2 as one "read pair"
        if "pair" in data.paramsdict["datatype"]:
            sample.stats["refseq_unmapped_reads"] = int(result1.split()[0]) / 2
            sample.stats["refseq_mapped_reads"] = int(result2.split()[0]) / 2
        else:
            sample.stats["refseq_unmapped_reads"] = int(result1.split()[0])
            sample.stats["refseq_mapped_reads"] = int(result2.split()[0])

        unmapped = os.path.join(data.dirs.refmapping, sample.name + "-unmapped.bam")
        samplesam = os.path.join(data.dirs.refmapping, sample.name + ".sam")
        for rfile in [unmapped, samplesam]:
            if os.path.exists(rfile):
                os.remove(rfile)

    # if loglevel==DEBUG
    log_level = ip.logger.getEffectiveLevel()
    if not log_level == 10:
        ## Clean up loose files only if not in DEBUG
        ##- edits/*derep, utemp, *utemp.sort, *htemp, *clust.gz
        derepfile = os.path.join(data.dirs.edits, sample.name + "_derep.fastq")
        mergefile = os.path.join(data.dirs.edits, sample.name + "_merged_.fastq")
        uhandle = os.path.join(data.dirs.clusts, sample.name + ".utemp")
        usort = os.path.join(data.dirs.clusts, sample.name + ".utemp.sort")
        hhandle = os.path.join(data.dirs.clusts, sample.name + ".htemp")
        clusters = os.path.join(data.dirs.clusts, sample.name + ".clust.gz")

        for rfile in [derepfile, mergefile, uhandle, usort, hhandle, clusters]:
            if os.path.exists(rfile):
                os.remove(rfile)
In [7]:
# optimize speed of this next
build_clusters_from_cigars(data, sample)

Consensus for refmapped data -- store all (even long pairs)?

In [ ]:
### Write each out to a sam file...
# newconsensus()
# 
data._este = data.stats.error_est.mean()
data._esth = data.stats.hetero_est.mean()

clusters = open(os.path.join(data.dirs.clusts, "{}.clustS.gz".format(sample.name)), 'r')
clusters.read()

# plan to fill an h5 for this sample
tmp5 = consenshandle.replace("_tmpcons.", "_tmpcats.")
with h5py.File(tmp5, 'w') as io5:
    io5.create_dataset("cats", (optim, maxlen, 4), dtype=np.uint32)
    io5.create_dataset("alls", (optim, ), dtype=np.uint8)
    io5.create_dataset("chroms", (optim, 3), dtype=np.int64)

    ## local copies to use to fill the arrays
    catarr = io5["cats"][:]
    nallel = io5["alls"][:]
    refarr = io5["chroms"][:]
In [ ]:
### Step 6 for refmapped pairs: 
# 1. convert all sams to bams and make a merged mapped-sorted.bam
# 2. get overlapping regions with bedtools_merge()
# 3. pull consens reads in aligned regions with bamfile.fetch()
# 4. store the consensus sequence in h5.
# 5. store the variants in h5.
# 6. store the depth of variants in h5.
# 7. 
In [7]:
# get all regions with reads. Generator to yield (str, int, int)
fullregions = bedtools_merge(data, sample).strip().split("\n")    
regions = (i.split("\t") for i in fullregions)
regions = ((i, int(j), int(k)) for (i, j, k) in regions)

# access reads from bam file using pysam
samfile = AlignmentFile(
    os.path.join(data.dirs.refmapping, 
    "{}-mapped-sorted.bam".format(sample.name)),
    'rb')
In [34]:
reg = next(regions)
ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
reads = samfile.fetch(*reg)
In [35]:
# match paired reads together in a dictionary
rdict = {}
for read in reads:
    rdict[read.qname] = read

# sort keys by derep number
keys = sorted(
    rdict.keys(),
    key=lambda x: int(x.split("=")[-1]), reverse=True)

# build the cluster based on map positions, orientation, cigar
clust = []
for key in keys:
    r1 = rdict[key]

    aref = np.array(list(ref[1]))
    arr1 = np.zeros(aref.size, dtype="U1")
    arr1.fill("-")

    # how far ahead of the start does this read begin
    seq = cigared(r1.seq, r1.cigar)
    start = r1.reference_start - reg[1] 
    arr1[start:start + len(seq)] = list(seq)
    aseq = "".join(arr1)

    ori = "+"
    if r1.is_reverse:
        ori = "-"
    derep = r1.qname.split("=")[-1]
    rname = "{}:{}-{};size={};{}".format(*reg, derep, ori)
    clust.append("{}\n{}".format(rname, aseq))
In [36]:
clust
Out[36]:
['MT:96809-96900;size=18;+\nTGCAGCTGCGGTAGTTAACGAACAGCCTGTCTTGTCTAAAGGGTTAAAAATCAGGTCCGGTGTACAGGCGACGATAGAGAACCCCGGCCTA',
 'MT:96809-96900;size=1;+\nTGCAGCTGCGGTAGTTAACGAACAGCCTGTCTTGTCTAAAGGGTTAAAAATCAGGTCCGGTGTACAAGCGACGATAGAGAACCCCGGCCTA',
 'MT:96809-96900;size=1;+\nTGCAGCTGCGGTAGTTAACGAACAGCCTGTCTTGTCTAAAGGGTTAAAAATCAGGTCCGGTGTACAGGCGACGACAGAGAACCCCGGCCTA']
In [30]:
rdict
Out[30]:
{'37013bcb5c4c2a2541dacf4f2e807af0;size=16': [<pysam.libcalignedsegment.AlignedSegment at 0x7fa80b8110a8>,
  None]}
In [16]:
mapping_reads(data, sample, 2)
---------------------------------------------------------------------------
IPyradError                               Traceback (most recent call last)
<ipython-input-16-193c6a7f0e62> in <module>()
----> 1 mapping_reads(data, sample, 2)

~/Documents/ipyrad/ipyrad/assemble/clustmap.py in mapping_reads(data, sample, nthreads)
   1498     #  -O = Output file format, in this case bam
   1499     #  -o = Output file name
-> 1500 
   1501     # (cmd5) samtools bam2fq -v 45 [in.bam]
   1502     #   -v45 set the default qscore arbirtrarily high

IPyradError: None
In [ ]:
 
In [9]:
s3.data.run("5")
Assembly: 4-refpairtest
[####################] 100% 0:00:00 | calculating depths   | s5 |
[####################] 100% 0:00:00 | chunking clusters    | s5 |
[####################] 100% 0:00:24 | consens calling      | s5 |
In [11]:
subsamples = list(s3.data.samples.values())
subsamples.sort(key=lambda x: x.stats.clusters_hidepth, reverse=True)
jobs = {}
In [8]:
from ipyrad.assemble.jointestimate import *
optim(data, sample)
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-8-9ad4f842e7b0> in <module>()
      1 from ipyrad.assemble.jointestimate import *
----> 2 optim(data, sample)

~/Documents/ipyrad/ipyrad/assemble/jointestimate.py in optim(data, sample)
    278     try:
    279         ## get array of all clusters data
--> 280         stacked = stackarray(data, sample)
    281 
    282         ## get base frequencies

~/Documents/ipyrad/ipyrad/assemble/jointestimate.py in stackarray(data, sample)
    254                     dtype=np.uint64).T
    255 
--> 256                 stacked[nclust, :catg.shape[0], :] = catg
    257                 nclust += 1
    258 

IndexError: index 499 is out of bounds for axis 0 with size 499
In [9]:
sample, _, _, nhidepth, maxlen = recal_hidepth(data, sample)
Out[9]:
(<ipyrad.core.sample.Sample at 0x7ff440145898>, 499, 495, 499, 495)
In [6]:
data
concat_multiple_edits(data, sample)
merge_pairs_with_vsearch(data, sample, True)
merge_end_to_end(data, sample, True, True)
In [7]:
dereplicate(data, sample, 2)
In [8]:
cluster(data, sample, 2, 1)
In [9]:
build_clusters(data, sample, 5)
In [10]:
muscle_chunker(data, sample)
In [14]:
for idx in range(10):
    handle = os.path.join(s3.data.tmpdir, 
        "{}_chunk_{}.ali".format(sample.name, idx))
    align_and_parse(handle, 5, 0)
In [15]:
reconcat(data, sample)
In [ ]:
 
In [6]:
s3.run()
[####################] 100% 0:00:01 | concatenating        | s3 |
[####################] 100% 0:00:01 | join unmerged pairs  | s3 |
[####################] 100% 0:00:00 | dereplicating        | s3 |
[####################] 100% 0:00:00 | splitting dereps     | s3 |
[####################] 100% 0:00:02 | mapping reads        | s3 |
[####################] 100% 0:00:37 | building clusters    | s3 |
In [6]:
 
In [9]:
maxlens, depths = get_quick_depths(data, sample)
maxlens, depths
Out[9]:
(array([460, 475, 470, 482, 468, 463, 488, 445, 474, 490, 446, 469, 482,
        457, 457, 470, 477, 445, 442, 468, 486, 447, 473, 460, 484, 467,
        482, 442, 446, 447, 458, 471, 455, 476, 480, 443, 461, 457, 479,
        452, 444, 464, 473, 477, 448, 446, 453, 488, 490, 482, 471, 483,
        480, 450, 456, 482, 454, 462, 457, 456, 455, 463, 447, 488, 457,
        446, 472, 450, 472, 482, 472, 451, 491, 479, 469, 446, 489, 486,
        463, 463, 489, 476, 458, 484, 463, 458, 443, 479, 488, 469, 470,
        474, 454, 447, 447, 489, 451, 449, 453, 491, 484, 471, 483, 481,
        483, 472, 468, 457, 464, 446, 490, 462, 452, 461, 457, 474, 469,
        481, 458, 461, 446, 472, 455, 455, 488, 471, 482, 489, 466, 461,
        473, 477, 466, 488, 461, 487, 453, 466, 450, 472, 458, 448, 490,
        484, 476, 445, 488, 451, 484, 486, 463, 469, 452, 462, 453, 476,
        448, 445, 468, 453, 463, 469, 487, 445, 453, 469, 447, 444, 464,
        488, 468, 448, 461, 465, 483, 446, 447, 473, 448, 448, 444, 481,
        464, 474, 464, 459, 461, 458, 451, 449, 468, 445, 491, 443, 448,
        475, 450, 454, 454, 467, 478, 452, 451, 468, 480, 468, 464, 484,
        462, 484, 475, 457, 463, 453, 473, 457, 486, 469, 485, 462, 464,
        442, 475, 473, 470, 482, 444, 456, 479, 472, 477, 446, 443, 471,
        473, 489, 467, 463, 476, 481, 448, 450, 460, 452, 458, 479, 444,
        457, 489, 488, 444, 453, 472, 461, 481, 451, 450, 491, 473, 480,
        491, 489, 468, 458, 472, 460, 450, 462, 447, 457, 465, 452, 468,
        475, 465, 491, 447, 453, 448, 471, 490, 472, 446, 450, 478, 468,
        455, 470, 454, 464, 479, 472, 456, 475, 479, 446, 474, 470, 474,
        446, 491, 479, 470, 457, 457, 482, 453, 468, 457, 454, 453, 445,
        489, 477, 451, 478, 485, 456, 485, 449, 485, 478, 468, 462, 454,
        486, 482, 471, 488, 444, 485, 450, 485, 449, 444, 453, 482, 484,
        472, 456, 474, 461, 487, 468, 482, 480, 459, 443, 489, 447, 444,
        453, 447, 470, 486, 442, 442, 487, 445, 471, 480, 475, 450, 456,
        473, 468, 455, 452, 491, 470, 479, 469, 469, 442, 489, 451, 476,
        464, 477, 487, 484, 468, 454, 462, 460, 476, 442, 476, 489, 481,
        477, 485, 456, 453, 476, 471, 473, 453, 480, 490, 479, 488, 470,
        483, 483, 450, 451, 459, 469, 456, 461, 458, 451, 450, 446, 482,
        446, 448, 472, 489, 485, 489, 449, 477, 488, 452, 460, 450, 471,
        444, 452, 456, 452, 467, 455, 489, 481, 474, 463, 456, 475, 453,
        445, 487, 481, 445, 476, 476, 483, 456, 455, 444, 443, 452, 478,
        488, 452, 471, 452, 483, 484, 490, 457, 487, 464, 455, 480, 491,
        475, 449, 489, 447, 468, 468, 485, 475, 484, 455, 444, 450, 466,
        448, 442, 459, 488, 482, 455, 474, 480, 482, 474, 484, 488, 469,
        475, 481, 482, 446, 473]),
 array([17, 14, 17, 20, 25, 21, 18, 25, 21, 24, 22, 23, 21, 22, 23, 20, 17,
        22, 19, 20, 25, 26, 17, 21, 19, 25, 19, 21, 20, 16, 21, 18, 22, 22,
        20, 21, 22, 23, 20, 19, 19, 17, 16, 27, 21, 20, 17, 19, 20, 18, 15,
        20, 24, 20, 19, 19, 20, 23, 20, 23, 20, 19, 20, 23, 22, 25, 17, 27,
        22, 26, 22, 19, 21, 18, 21, 20, 19, 20, 25, 19, 19, 16, 23, 19, 17,
        18, 18, 21, 16, 19, 26, 22, 22, 23, 18, 22, 21, 24, 18, 23, 19, 18,
        20, 18, 21, 22, 21, 22, 19, 19, 18, 20, 17, 19, 22, 25, 20, 21, 19,
        21, 19, 20, 24, 19, 22, 22, 20, 19, 21, 20, 17, 21, 19, 19, 22, 19,
        19, 16, 19, 22, 18, 19, 15, 18, 18, 20, 20, 25, 17, 21, 19, 18, 18,
        22, 21, 21, 19, 21, 14, 25, 19, 17, 20, 21, 14, 20, 21, 20, 18, 25,
        27, 16, 19, 18, 21, 22, 15, 18, 19, 21, 21, 19, 17, 20, 19, 20, 22,
        24, 18, 18, 18, 25, 19, 19, 19, 17, 26, 18, 20, 22, 20, 22, 22, 21,
        21, 21, 21, 18, 18, 19, 19, 19, 25, 17, 22, 21, 17, 18, 18, 18, 20,
        22, 17, 19, 19, 20, 25, 19, 25, 18, 17, 25, 15, 19, 19, 23, 19, 22,
        23, 17, 15, 22, 16, 20, 24, 23, 19, 18, 24, 19, 20, 20, 19, 19, 18,
        26, 18, 18, 23, 20, 18, 21, 22, 25, 23, 16, 19, 25, 24, 22, 21, 20,
        20, 18, 25, 17, 18, 14, 17, 17, 18, 19, 17, 19, 12, 15, 24, 16, 19,
        15, 18, 20, 21, 17, 19, 21, 26, 19, 22, 19, 19, 21, 29, 19, 22, 21,
        22, 19, 23, 17, 25, 20, 21, 20, 23, 18, 21, 19, 27, 14, 18, 16, 19,
        21, 17, 25, 17, 19, 21, 20, 23, 22, 18, 24, 20, 20, 21, 25, 15, 18,
        17, 21, 27, 20, 24, 23, 21, 20, 26, 20, 23, 20, 21, 23, 21, 19, 18,
        21, 19, 21, 15, 22, 22, 23, 21, 19, 20, 14, 21, 25, 18, 20, 13, 17,
        26, 23, 19, 18, 14, 22, 25, 21, 17, 23, 23, 17, 20, 19, 22, 20, 21,
        18, 17, 17, 19, 26, 19, 18, 21, 20, 19, 22, 15, 20, 14, 19, 22, 23,
        20, 17, 23, 16, 17, 18, 18, 20, 16, 16, 14, 22, 19, 19, 22, 16, 22,
        24, 18, 21, 22, 18, 22, 20, 25, 17, 16, 25, 27, 20, 22, 22, 23, 21,
        21, 23, 15, 17, 22, 21, 23, 17, 16, 26, 20, 17, 22, 24, 21, 21, 24,
        21, 20, 21, 21, 21, 19, 24, 18, 22, 23, 21, 15, 18, 21, 27, 14, 17,
        21, 23, 26, 23, 25, 22, 20, 20, 18, 19, 20, 21, 25, 13, 21, 17, 19,
        18, 21, 19, 17, 21, 19]))
In [ ]:
 
In [7]:
data = s3.data
samples = list(s3.data.samples.values())
sample = samples[1]
In [8]:
sample = samples[1]
In [9]:
concat_multiple_edits(data, sample)
In [10]:
merge_end_to_end(data, sample, False)
In [11]:
dereplicate(data, sample, 2)
In [12]:
split_endtoend_reads(data, sample)
In [13]:
mapping_reads(data, sample, 2)
In [14]:
build_ref_cigars(data, sample)
In [ ]:
 
In [ ]:
#samtools mpileup -uf ref.fa aln1.bam aln2.bam | bcftools view -bvcg - > var.raw.bcf
! samtools mpileup -ur /home/deren/Dropbox/opbox/Maud/lgeorge.genome.fa 
In [9]:
#regions = bedtools_merge(data, sample).strip().split("\n")
fullregions = bedtools_merge(data, sample).strip().split("\n")
In [10]:
regions = (i.split("\t") for i in fullregions)
regions = ((i, int(j), int(k)) for (i, j, k) in regions)
In [ ]:

In [14]:
def get_ref_region(reference, contig, rstart, rend):
    "returns the reference sequence over a given region"
    cmd = [
        ip.bins.samtools, 'faidx', 
        reference,
        "{}:{}-{}".format(contig, rstart + 1, rend),
        ]
    stdout, err = sps.Popen(cmd, stdout=sps.PIPE).communicate()
    name, seq = stdout.decode().split("\n", 1)
    listseq = [name, seq.replace("\n", "")]
    return listseq
In [527]:
def build_ref_cigars(data, sample):
    
    # get all regions with reads. Generator to yield (str, int, int)
    #fullregions = bedtools_merge(data, sample).strip().split("\n")    
    regions = (i.split("\t") for i in fullregions)
    regions = ((i, int(j), int(k)) for (i, j, k) in regions)

    # access reads from bam file using pysam
    samfile = AlignmentFile(
        os.path.join(data.dirs.refmapping, 
        "{}-mapped-sorted.bam".format(sample.name)),
        'rb')

    # iterate over all regions
    out = open("test.clustS", 'w')
    idx = 0
    clusters = []
    for reg in regions:
        ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
        reads = samfile.fetch(*reg)

        # match paired reads together in a dictionary
        rdict = {}
        for read in reads:
            if read.qname not in rdict:
                rdict[read.qname] = [read, None]
            else:
                rdict[read.qname][1] = read

        # sort keys by derep number
        keys = sorted(
            rdict.keys(),
            key=lambda x: int(x.split("=")[-1]), reverse=True)

        # build the cluster based on map positions, orientation, cigar
        clust = []
        for key in keys:
            r1, r2 = rdict[key]
            if r1 and r2:

                aref = np.array(list(ref[1]))
                arr1 = np.zeros(aref.size, dtype="U1")
                arr2 = np.zeros(aref.size, dtype="U1")
                arr1.fill("-")
                arr2.fill("-")

                try:
                    # how far ahead of the start does this read begin
                    seq = cigared(r1.seq, r1.cigar)
                    start = r1.reference_start - reg[1] 
                    arr1[start:start + len(seq)] = list(seq)

                    seq = cigared(r2.seq, r2.cigar)
                    start = r2.reference_start - reg[1] 
                    arr2[start:start + len(seq)] = list(seq)

                    arr3 = join_arrays(arr1, arr2)
                    pairseq = "".join(arr3)
                    derep = r1.qname.split("=")[-1]
                    clust.append("{}\n{}".format("{}:{}-{};size={}"
                                                 .format(*reg, derep), pairseq))
                except ValueError:
                    print(reg)
        clusters.append("\n".join(clust))
        idx += 1
        if not idx % 1000:
            out.write("\n//\n//\n".join(clusters))
    out.close()
In [549]:
def build_ref_cigars(data, sample):
    
    # get all regions with reads. Generator to yield (str, int, int)
    #fullregions = bedtools_merge(data, sample).strip().split("\n")    
    regions = (i.split("\t") for i in fullregions)
    regions = ((i, int(j), int(k)) for (i, j, k) in regions)

    # access reads from bam file using pysam
    samfile = AlignmentFile(
        os.path.join(data.dirs.refmapping, 
        "{}-mapped-sorted.bam".format(sample.name)),
        'rb')

    # iterate over all regions
    opath = os.path.join(
        data.dirs.refmapping, "{}.clustS.gz".format(sample.name))
    out = gzip.open(opath, 'w')
    idx = 0
    clusters = []
    for reg in regions:
        ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
        reads = samfile.fetch(*reg)

        # match paired reads together in a dictionary
        rdict = {}
        for read in reads:
            if read.qname not in rdict:
                rdict[read.qname] = [read, None]
            else:
                rdict[read.qname][1] = read

        # sort keys by derep number
        keys = sorted(
            rdict.keys(),
            key=lambda x: int(x.split("=")[-1]), reverse=True)

        # build the cluster based on map positions, orientation, cigar
        clust = []
        for key in keys:
            r1, r2 = rdict[key]
            if r1 and r2:

                aref = np.array(list(ref[1]))
                arr1 = np.zeros(aref.size, dtype="U1")
                arr2 = np.zeros(aref.size, dtype="U1")
                arr1.fill("-")
                arr2.fill("-")

                # how far ahead of the start does this read begin
                seq = cigared(r1.seq, r1.cigar)
                start = r1.reference_start - reg[1] 
                arr1[start:start + len(seq)] = list(seq)
                
                seq = cigared(r2.seq, r2.cigar)
                start = r2.reference_start - reg[1] 
                arr2[start:start + len(seq)] = list(seq)
                
                arr3 = join_arrays(arr1, arr2)
                pairseq = "".join(arr3)

                derep = r1.qname.split("=")[-1]
                rname = "{}:{}-{};size={}".format(*reg, derep)
                clust.append("{}\n{}".format(rname, pairseq))
        clusters.append("\n".join(clust))
        idx += 1
        if not idx % 100:
            out.write("\n//\n//\n".join(clusters).encode())
    out.close()
In [550]:
c = build_ref_cigars(data, sample)
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-550-1567c51d5c9b> in <module>()
----> 1 c = build_ref_cigars(data, sample)

<ipython-input-549-0d680f391a25> in build_ref_cigars(data, sample)
     20     clusters = []
     21     for reg in regions:
---> 22         ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
     23         reads = samfile.fetch(*reg)
     24 

~/Documents/ipyrad/ipyrad/assemble/clustmap.py in get_ref_region(reference, contig, rstart, rend)
   2150     """
   2151     inp = os.path.join(data.tmpdir, "{}_derep.fastq".format(sample.name))
-> 2152     out1 = os.path.join(data.tmpdir, "{}_R1-tmp.fastq".format(sample.name))
   2153     out2 = os.path.join(data.tmpdir, "{}_R2-tmp.fastq".format(sample.name))
   2154 

~/miniconda3/lib/python3.6/subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors)
    707                                 c2pread, c2pwrite,
    708                                 errread, errwrite,
--> 709                                 restore_signals, start_new_session)
    710         except:
    711             # Cleanup if the child failed starting.

~/miniconda3/lib/python3.6/subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)
   1296                 errpipe_data = bytearray()
   1297                 while True:
-> 1298                     part = os.read(errpipe_read, 50000)
   1299                     errpipe_data += part
   1300                     if not part or len(errpipe_data) > 50000:

KeyboardInterrupt: 
In [547]:
"\n//\n//\n".join(c).encode()
Out[547]:
b'Contig0:3481-3748;size=3\nCATGAGGTCCTTAGGGAGGGAATCAACGAAGTAGCCAAACACTTGCTGAATGTTGAAATGAGCCCCTACAG------------------------------------------------------------------------------------------------------------------------TTCCAGGGTCACACCAGCCAAGGCACTGTTGCAGTCACTTAACAAAGCAGTGTGATTCCCAGTTGGCCACAGAATT\n//\n//\nContig0:4544-5088;size=1\nCATGCTTGTTTAAAGTTGTGCAGTGCTCTTTTCTGGCAGGGGATTGGTCCTGCTTTGAGCAGGGGGTTGGA--------------------------------------------------------------------------------------------------------------------------------------TGGACTGGGAGCCCCTCTGTCAGCTCCCTGCTCCCCTAAGTTCCCTGTGCTGCAGTCGCCCAGCAGGCTATCAATT-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:4544-5088;size=1\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTGCTGCTCCTGCCCTCTGCCTTGGAGCTGCTCCCAGAGACTCCTGCTTGCTGTGCAGGGAGGAAAGG------------------------------------------------------------------------------------GAGGGAGAGAGACAGAGAGAGCTTGGGGCAGCAGCTGCTGTCTCAACTTCCTGATCCACTGACAAACAATGCAATT\nContig0:4544-5088;size=1\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTGCTGCTCCTGCCCTCTGCCTTGGAGCTGCTCCCAGAGACTCCTGCTTGCTGCGCAGGGAGGAAAGG------------------------------------------------------------------------------------GAGGGAGAGAGACAGAGAGAGCTTGGGGCAGCAGCTGCTGTCTCAACTTCCTGATCCACTGACAAACAATGCAATT\nContig0:4544-5088;size=1\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGGTGTCCCCCTCCTCCCTGCTCCTGCACCCTGCTTACCTCTTCTCCATATAGAGCAGGGAGGGGACAC-GAGGGAGAGAGACAGAGAGAGCTTGGGGCAGCAGCTGCTGTCTCAACTTCCTGATCCACTGACAAACAATGCAATT\n//\n//\nContig0:7193-7952;size=1\nAATTGATGGGACCCTGGAACATTTCTAGCTTGTACTGATTGTTTTGGGATTTTTTTGTTTGTTCTCTGGTTTCAATAGCAGTTGGGTGCCCAGATACCTGGGGGGAGCAGTTGGAGGGGGGTTCGCCTTCCTCTGCAGCATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:7193-7952;size=1\n------------------------------------------------------------------------------------------------------------------------------------------CATGGAGCACAGGACATTTGCAGGTTTAAACTAGTGTAAATGGTGAATCCTCTGTGACTTAAAGTCTTTAA----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GAAGCAGCAAATAGATGGGGTGAGGGCAGGTGAATGGAGCGACTCCAATCAGATCATGGGTTTTAGCAGACTAATT--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:7193-7952;size=1\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTAACATTGCTGTGCGATTCCTGGACCACTGCAGAATTCAGTGAAATCTAGGGGTCTTGGCTGCCTAAATACAT------------------------------------------------------------CCTAGCACCAGGGGACCTCCAGGTGCTACTGTAATATACAAGGGGATGATGATGATAGGATGCAGCACATG\n//\n//\nContig0:13327-13845;size=4\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGCTTTCTCATTCACACACTTTTCATAGGGTGCTACAGTCATGCATCCTGATCCATACTTCTGTTACCC---------------------------------------------------------------------------------------------CTGCCCCCTCCCGTGGACGGACTCCGGGTGGGTGTATCGCAGGCTGGGAGCATACTGAAGGTGGACACATG\nContig0:13327-13845;size=1\nCCAGCCAGGGCTTTGTCAAGCCTGACCTTAAAAACCTCTAAGGAAGGAGATTCCACCACCTCCCTAG----------------------------------------------------------------------------------------------------------------------TGAGAACAGTCTAGATCCAACCTCTTTGGAACCCCCTTTCAGGTAGTTCAAAGTAGCTATCAAAT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n//\n//\nContig0:14393-14579;size=1\nCATGCTGGGCCAGATCCACCCCCCTGGGGAAGCCCAGGGGATTGCACTAGGGATCAGGTTTGCCCCTT---------------------------------------------TTTAAATGAGTCCTTATTTGACTCTGGGCTCACCAGGCCACCTCCAAGCTGTTCCTTGGCTGGCTCCGTAATT\n//\n//\nContig0:14907-15026;size=1\nCATGATCTTGGACACATCTGTTCACCTCTCTTCTTCCCCTCCACGCCCCACCACCCAAAAAAATCTGTGAAATGGTAGGGTTGATGCTGCTTTACCTTCTGGGGGTGCTTGTGAGAATT\n//\n//\nContig0:15377-15552;size=1\nAATTGCCCTGCTTGCTACCCCTTAACACCAGCCCTGGGTTTTACATGCAGAAAACAGGTGTTGTGGCACAGG--------------------------------TGATCTCGGCAGGCTGCTGTTGGTCCATGTGCCATGGTCCAGCAGCACTCCTGGACTTGCAAACATTCATG\n//\n//\nContig0:16736-16933;size=1\nAAATTCTCAGGGCTTTGCTATGCAGGAGATCAGACTAGATTA------------------------------------------------------------------------------------GCAGGAAAATCTGTTACCTTTGTGTTTTCCCTGGGAGAGGTGGGGGCGCCGCTGGCTCCGACTCCGGCATG\n//\n//\nContig0:19843-20061;size=1\nCATGTGACTTTCCCTGACGGGGAAAACCACTGTGACCTGACCAGAGGGCCAAGCCATGAAACGGGAGCAGC-----------------------------------------------------------------------CAGCCAGAAGGAGGTGCTCCAGCTCTGAGTAGAGCTCCTCCAAGGATGAATAGAGATCTGGACTGGAAGCTAAATT\nContig0:19843-20061;size=1\nCATGTGACTTTCCCTGACGGGGAAAACCACTGTGACCTGACCAGAGGGCCAAGCCATGAAACGGG-----------------------------------------------------------------------------CAGCCAGAAGGAGGTGCTCCAGCTCTGAGTAGAGCTCCTCCATGGATGACTAGAGATCTGGACTGGAAGCTAAATT\nContig0:19843-20061;size=1\nCATGTGACTTTCCCTGACGGGGAAAACCACTGTGACCTGACCAGAGGGCCAAGCCATGAAACGGGAGCAGC-----------------------------------------------------------------------CAGCCAGAAGGAGGTGCTCCAGCTCTGAGTAGAGCTCCTCCATGGATGACTAGAGATCTGGACTGGAAGCTAAATT\n//\n//\nContig0:22414-22496;size=1\nAATTCTCGATCATCACGTGCACTGGGTGTGGCCGGATTTCCGAGATGGATGGAACTGGGTGAGCCAAATGAATCCCTCCATG\n//\n//\nContig0:22928-23117;size=1\nCATGGGCCAGGATTCCAAAGGTTAACACCCCGGCAGCAGAGGTGGGGAGCTGAGAGCACTCAGCACCTCGA--------------------------------------------GGACTGCACTCTATATCCACCCTTGAATAGGAACCTACATGAAATGGAGTCGCTCTCCAGATGTTTGGGGAATT\n//\n//\nContig0:23425-23627;size=1\nCATGCATAAGGTCTCTAAATCCCACCACTGTTCTGATCTTTGTTTTGTAAAGTGCCCATGAGCGTAGCATC-------------------------------------------------------------------------------------GGAGAGGGTACGCTGTATTCTTCAGACATAACGGCTCGATTAATT-\nContig0:23425-23627;size=1\nCATGCATAAGTTCTCTAAATCCCACCACTGTTCTGATCTTTGTTTTGTAAAGTGCCCATGAGCGTAGCATC------------------------------------------------------------------------------------------GGGTACGCTGTATTCTTCAGACATAACGGCTCGATTAATTA\nContig0:23425-23627;size=1\n--------------------------------------------------------CATGAGCGTAGCATCAGGCCCAAATGACAGGGCTGATATGCCACACACTGGTAAACTCTGTCCCATTTCTCATTCACTCTGCATTCAGTGGTGTAATGTGGGAGAGGGTACGCTGTATTCTTCAGACATAACGGCTCGATTAATT-\n//\n//\nContig0:24391-24908;size=3\n--------------------------AATTATTATGTACATAGTTTCGTCCTATTCAGTGTCTACTCAGCGCTTCTCGGCTTGTCTCTTGTATTCATTAAAT-------------------------------------------------------------------------------------------------------------------CGCTCACTGCTCAGCAGTTCGATGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:24391-24908;size=1\nAATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCT----------------------------------------------------------------------TGATGGGCTCCATTTGCCCTGATAGCGTTTCTCCATTGTTGCAATGTCCTGGTGAAATCGCTCGCC--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:24391-24908;size=1\nAATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCT------------------------------------------------------------------------------------------------------------------------------------------------CGCTCACTACTCCACAGTTCAGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATCTATCTTTAGTGACATG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:24391-24908;size=1\nAATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACGTAGTTTAGTCCTATTCAATGTCTACTCAGCGCT----------------------------------------------------------------------TGATGGGCTCCATTTGCCCTGATAGCGTTTCTCCAATGTTGCAATGTCCTGGTGAAATCGCTCGCCATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:24391-24908;size=1\nAATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCT--------------------------------------------------------------------------GGGCTCCATTTGCCCTGATAGTGTTTCTCCATTGTTGCAATGTCCTGGTGAAATCGCTCACCGTGCTC--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:24391-24908;size=1\nAATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCT---------------------------------------------------------------------TTGATGGGCTCCATTTGCCCTGATAGCGTTTCTCCATTCCTGCAATGTCCTGGTGAAATCGCACGCCATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:24391-24908;size=1\n-----------------------------------------AATTTAGTCCTATTCGGGGTCTACTCAGCGCTTCTTGGCTTGTCTCTTGTATTCATTAAATGGAGCATCTCTTGTC----------------------------------------------------------------------------------------------------CGCTCACCGCTCCGCAGTTCGGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:24391-24908;size=1\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGCCGTCTTTTCCTTGCCATGCAGTGCATGGTCAAATGCATCGTCTCGAAGAAGTTCACCAATCTGAGGACCAACAAAGACACCTTCCTTTGTCTTAGCTTCACTTAACCTTGGAAATT\n//\n//\nContig0:25290-25837;size=1\nAATTGATGGCAAAACATTGCCATTATGCAGCAAAACAACTTTAAGACTCGTCTTCGATGAATCAATGAACAGTCTC----------------------------------------------------TGTTGCAGGCTACTAGATCACCTTCCATGAAGAAGAATGGGACAAGATCCTTTTGATGGTCACGGAACATG------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:25290-25837;size=1\nAATTGATGGCAAAACATTGCCATTATGCAGCAAAACAGCTTTAAGACTTGTCTTCGATGAATCAATGAACAGTCTC----------------------------------------------------TATTGCAGGCTCCAAGATCACCTTCCATGAAGAAGAATGGGACAAGATCCTTTTGATGGTCACGGAACATG------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:25290-25837;size=1\n----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTAATAAAGTTTGACTACATTTATTTCAGAAGCATTTTGGCTGTAGAGCAGCGAAATCAAAGCTGTGCCGATGC------------------------------------------------------------------------------------------AAGCAAATAGAGACATTTTCAGGTCTGCTGGCTGGTTAGGCTCCATAGTCCTACGCAAGGATGTGATCATG\n//\n//\nContig0:28260-28458;size=1\nCATGCGTCTGCAGAAAAGTACGGAGACTGGGCAAAGGCTGTGAGAAAGAAGAACGTTTGGGGCTTGTCCAA---------------------------------------------------ATCCAGTTCCTCTGCCTACTCTGGGACATTTATATCACAGAGTAGGCACTGTAATAAAAAGCAAGCAATGAGAATT\n//\n//\nContig0:28945-29088;size=1\nAATTCAAGAACTCAAAGGGACACTATCACCTTAAACTAGCAGGCACACACAAGAGGCAGTAGCTAGGTTGATGGGAGGATGCTCAGAGGTGTGGATTCTTTCATCGATCTAGCAGTGCCCACACTAGGGCTTAGGGTGGCATG\n//\n//\nContig0:30099-30713;size=7\n--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCAACTAGTCCAGATGGCTGCATTTCCCCATACACCCACTACTCCTGTTTGGAAGCCTCTCCATCACTTCCT---------------------------------------------------------------------------------------------------------------------------TGGTAGGGGCTGAAGGCTCTGGTCTGGGGTGGGCTTGATCTCTTGTACAGAAAACAGAGCTGAACCACATG\nContig0:30099-30713;size=1\nAATTAATGCTAGTCTTAATGAGAGTCTATTTCAACTCTTATGTAGCTGACAGGCTTTCTCAGGGCATGGCTGCTGT-------------------------------------------------------------------------------------------------TAGTTCCTTGAAGAATCAGGTTTTCCCTATTTTTTTTTAATCTCCTGTGTCAACTGGCGTCTTGGACATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:30099-30713;size=1\nAATTAATGATAGTCTTAATGAGAGTCTATTTCAACTCTTATGTAGCTGACAGGCTTTCTCAGGGCATGGCTGCTGT------------------------------------------------------------------------------------------------TTAGTTCCTTGAAGAATCAGGTTTTCCCTATTTTTTTTTAATCTCCTGTGTCAACTGGCGTCTTGGACATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:30099-30713;size=1\n--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCAACTAGTCCAGATGGCTGCAATTCCCCATACACCCACTACTCCTGTTTGGAAGCCTCTCCATCACTTCCT---------------------------------------------------------------------------------------------------------------------------TGGTAGGGGCTGAAGGCTCTGGTCTGGGGTGGGCTTGATCTCTTGTACAGAAAACAGAGCTGAACCACATG\nContig0:30099-30713;size=1\n--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCAACTAGTCCAGATGGCTGCATTTCCCCATACACCCACTACTCCTGTTTGGAAGCCTCTCCATCACTTCCT---------------------------------------------------------------------------------------------------------------------------------------------------TGGGGTGGGCTTGACCTCTTGTACAGAAAACAGAGCTGA--------\n//\n//\nContig0:32872-33505;size=1\nAATTCCTCTCCCTCCCTAACGTTAATCCCCCTGATATATTTATATAGAGCAAGCA---------------------------------------------------------------------------------CTCGGATCATCCTAGTAGCCCGTCTCTGAACCTGTTCCAGTTTGAATTC----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:32872-33505;size=1\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGGATCCACTCAAGCAGTGCAAAGCAGCCGGAAATCCAGCCAATCAAGTTTGTATAGGCAAAATCCCTG-----------------------------------------------------------------------------------------------------------------------------------GGAAGCTGTGAAAGCAGCAGACAGCCACCTTGCCCCCAGCCTTAAAATCTGGACTGAGCTGAGTTTGGATGCAATT\n//\n//\nContig0:33785-34014;size=3\nAATTAGTGCTAGGAGTCATTTTATTTAGCACCGTCATTAATGATTTAGGAAGGTGGAGCGTTAGGAGCCCATTAAT----------------------------------------------------------------------------------ATAGGCCTTCCCAAGCTTTGGCAACCACCTATTCCCATAGGTGCTACAGGGGGCAGAGCTCCGTGTGCATG\nContig0:33785-34014;size=1\nAATTAGTGCTAGGAGTCATTTTCTTTAGCAGCGTCATTAATGATTTAGGAAGGTG------------------------------------------------------------------------------------------------------------CCTTCCCAAGCTTTGGCAACCACCTATTCCCATAGGTGCTACAGGGGGCAGAGCTCCGTGTGCATG\n//\n//\nContig0:36273-36404;size=1\nAATTGCCCCACTTGCCTCCTTGCCGGGAGGCCCTGAAACACCCCCAGGGAAAAATAAAACTCAGCGCCTGTGATATGCAGTCTTAGATATAGAGGCAGGCTGAGGAGGGGCTGGGTGTGAGGGGGAGCATG\n//\n//\nContig0:37976-38353;size=5\nAATTAAATGCTACAGTGTTGAGCTGTTTTTTGAGGAAGAGCCTGTTCCTTGAAAGCAACTGGCCTTTGCTTTTCTT----------------------------------------------------------------------------------------------------------------------------GAGGCTGTGGGATCGGCAGCTGGAGTTCCCAGGGCTGCTTGCTGAACTTGTTTATTGTCGATTTTCCCATG----------------------------------------------------------------------------------------------------------\nContig0:37976-38353;size=1\n---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTTAAGTATCCTCACACCTCTTGTCTACTGTCTGAAATGGGCCGTCTTGNTTATCACTACAAAAGTTTTTTTTTTCTCCTGCTGATAATAGCTCATCTTAATTAATT\n//\n//\nContig0:39644-39906;size=1\nAATTTGGACATCACGATTCTCAGGCGGCAGGTTCATGCCATGGAACACCGATTCTGGGCCTTGGAAAC---------------------------------------------------------------------------------------------------------------------------GTGCAAGAATACCAAGATGAGAGCAGCCCTCACAGTTGAGAAGCAAGTGACGATAGCCCTGTGGAGGCATG\n//\n//\nContig0:40396-40633;size=11\nCATGCACAGGCAGCTTGGACAGTAGTCAGGAGCTGTTCAACTATAGACTGAGCAAGTGCAGAATGGTGGTA------------------------------------------------------------------------------------------GCTTGTTGTGTGCTCCACAATATCTGTGAGACTAAGGGGGAGATGTTTATGGCGGAGTGGGAGATTGAGGCAAATT\nContig0:40396-40633;size=1\nCATGCACAGGCAGCTTGGACAGTAGTCAGGAGCTGTTCAACTATAGGCTGAGCAAGTGCAGAATGGTGGTA------------------------------------------------------------------------------------------GCTTGTTGTGTGCTCCACAATATCTGTGAGACTAAGGGGGAGATGTTTATGGCGGAGTGGGAGATTGAGGCAAATT\nContig0:40396-40633;size=1\nCATGCACAGGCAGCTTGGACAGTAGTCAGGAGCTGTTCAACTATAGACTGAGCAAGTGCAGAATGGTG------------------------------------------------------------------------------------------------------------------------GAGACTAAGGGGGAGATGTTTATGGCGGAGTGGGAGATTGAGGCAAATT\nContig0:40396-40633;size=1\nCATGCACAGGCAGCTTGGACAGTAGTCAGGAGCTGTTCAACTATAGACTGAGCAAGTGCAGAATG------------------------------------------------------------------------------------------------GCTTGTTGTGTGCTCCACAATATCTGTGAGACTAAGGGGGAGATGTTTATGGCGGAGTGGGAGATTGAGGCAAATT\n//\n//\nContig0:41620-42163;size=14\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTGGCTAACAATAGGGATGATTTCTGTTCAGCCAAAGGTAAACAGCCCAGCAGGAACGGCCATCTCTG------------------------------------------------------------------------------------------------CATCCCCATACACATTAATAGACTTTTCCAGTAGCTGTACTGTCTGCCAATGCATCCCAAGTCTTCAGGGCAAATT\nContig0:41620-42163;size=1\nCATGGCATGTGTGCTTTCTTTACAAGATCGCATTTTGCCTCTTATATTGAGGGCCTGCTGGTTTGGCGTGAGAGATCACACACGCAGGGCTGGTGGGCAACAGAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:41620-42163;size=1\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTGGCTAACAATAGGGATGATTTCTGTTCAGCCAAAGGTAAACAGCCCAGCAGGAACGGCCATCTCTG--------------------------------------------------------------------------------------------------TCCCCATACACATTAATAGACTTTTCCAGTAGCTGTACTGTCTGCCAATGCATCCCAAGTCTTCAGGGCAAATT\nContig0:41620-42163;size=1\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTGGCTAACAATAGGGATGATTTCTGTTCAGCCAAAGGTAAACAGCCCAGCAGGAACGGCCATCTC----------------------------------------------------------------------------------------------------TCCCCATACACATTAATAGACTTTTCCAGTAGCTGTACTGTCTGCCAATGCATCCCAAGTCTTCAGGGCAAATT\nContig0:41620-42163;size=1\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTGGCTAACAATAGGGATGATTTCTGTTCAGCCAAAGGTAAACAGCCCAGCAGGAACGGCCATCTCTG------------------------------------------------------------------------------------------------CATCCCCATACACATTAATGGACTTTTCCAGTAGCTGTACTGTCTGCCAATGCATCCCAAGTCTTCAGGGCAAATT\n//\n//\nContig0:42764-42845;size=1\nCATGGTCACCTGTGCTGATGAGCTCTGCATGGTCACCTGTGCTGATCAGCTTGCCACGCTGGTCAAACAGGAAATCAAATT\n//\n//\nContig0:44568-44784;size=1\nAATTTCCACAGGGCGGGAAAACCATTTTGTGCCCAGCACCCATTCAGGCAGCTGGTCACTGGGGTTGTCTGTGATG---------------------------------------------------------------------CCCTTTGATGACCATCCCCGCTGCTTAAGACCCAAAAGCAGAGCCTCGACCGCTGATGTTTCATGCACATG\n//\n//\nContig0:48224-48506;size=2\n-------------------------------------------------------------------CATGTCCAAATAATGTTGGACAACATAAGAAGGGCATTTTATATAAACAAGCAGGGGGGAGTGAGATCTCT--------------------------------------------------------------------ATATTTACCAGGGTTGCAAAACTCCCTAGCAGAGAGCCTGAGTGGATGTTTTTCCACAAATAGGAGATTCACAATT\nContig0:48224-48506;size=1\nTAATTTACTAGAACTCCAGGTGGTTCACTGAGCCTGCATCTCCTTCCCCCTGCTCATCCAATCCAGACATG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n//\n//\nContig0:49781-50026;size=1\nAATTTGCTTTTGCGGGCCAGTCCCATCTGAGGGCTACTCAGTTCCTCTTATGTGGAAGGGGAAGTAGTGGTCTTGA--------------------------------------------------------------------------------------------------GGACACCGTACTGATGTGGGAGAGAAAAGAGAGCAAGCTTAGATTGCATGCACTTTGCAGCAGCACCCATG\nContig0:49781-50026;size=1\nAATTTGCTTTTGCGGGCCAGTCCCATCTGAGGGCTACTCAGTT--------------------------------------------------------------------------------------------------------------CGGCTTCGTGTTGAAGCACTCGGACACCGTACTGATGTGGGAGAGAAAAGAGAGCAAGCTTAGATTGCATG---------------------\nContig0:49781-50026;size=1\nAATTTGCTTTTGCGGGCCAGTCCCATCTAAGGGCTACTCAGTTCCTCTTATGTGGAAGGGGAAGTA------------------------------------------------------------------------------------------------------------------CGTACTGATGTGGGAGAGAAAAGAGAGCAAGCTTAGATTGCATGCACTTTGCAGCAGCACCCATG\nContig0:49781-50026;size=1\nAATTTGCTTTTGCGGGCCAGTCCCATCTGAGGGCTACTCAGTTCCTCTTATGTGGAAGGGGAAGTAGTGGTCTTGA-----------------------------------------------------------------------------CGGCTTCGTGTTGAAGCACTCGGACACCGTACTGATGTGGGAGAGAAAAGAGAGCAAGCTTAGATTGCATG---------------------\n//\n//\nContig0:51214-51341;size=1\nCATGCCAGGTCCCACTGGCGGAGGCAGGGTCACGCCCAGGCCATCCCTGGAAGTACCAGAATGTCCAGTATTCTGGGGATGCATTAGGGGTCACTTTACTCAAAAGTGGCAAAGCCTCTAGCGAATT\n//\n//\nContig0:53474-53686;size=1\nCATGTTAACCAGGCCGGGTGCCCCAGGCCTCAGGCATCCTGCCTCTTAACCAGGCCCAGATGCTCTTCTTT--------------------------------------------------------------------------------------------------AACTCGCTGTGTGACTTTGATCAAGTCACTTTGCCTTTCCATG\nContig0:53474-53686;size=1\nCATGTTAACCAGGCCGGGTGCCCCAGGCCTCAGGCATCCTGC---------------------------------------------------------------------------------------------------GAACATTTGGTTTCTCTTCTGCGCCAATAACTCGCTGTGTGACTTTGATCAAGTCACTTTGCCTTTCCATG\n//\n//\nContig0:53920-54018;size=1\nCATGATATGCAGTGGAGGGGAGAGGAACCCCTGGGCCTGGCAGTTCAGAGCCCTGGCACCTCTGGGCTTGCTGCAGCAGTTACAAATGTAAAAAAATT\n//\n//\nContig0:56088-56333;size=10\n-CATGCTTCATTAGTTGTTTGTGTCCAGCTAGCAATGGATTTAAATCAAGAGATAGCCGAGATTGGAATCTG-------------------------------------------------------------------------------------------------TAATAATCAAGAGAGTTTGGGCTGTCTTTTGTTTGCAGTAATGTTTAGCAGCTGACACTGCAGCAGCTGAATAATT\nContig0:56088-56333;size=1\nTCATGCTTCATTAGTTGTTTGTGTCCAGCTAGCAATGGATTTAAATCA-------------------------------------------------------------------------------------------------------------------------TAATAATCAAGAGAGTTTGGGCTGTCTTTTGTTTGCAGTAATGTTTAGCAGCTGACACTGCAGCAGCTGAATAATT\nContig0:56088-56333;size=1\n-CATGCTTCATTAGTTGTTTGTGTCCAGCTAGCAATGGATTTAAATCAAGAGATAGCCGAGATTGGAATCTG----------------------------------------------------------------------------------------------------TAATCAAGAGAGTTTGGGCTGTCTTTTGTTTGCAGTAATGTTTAGCAGCTGACACTGCAGCAGCTGAATAATT\nContig0:56088-56333;size=1\n-CATGCTTCATTAGTTGTTTGTGTCCAGCTAGCAATGGATTTAAATCAAGAGATAGCCGAGATTGGAATCTG--------------------------------------------------------------------------------------------------AATAATCAAGAGAGTTTGGGCTGTCTTTTGTTTGCAGTAATGTTTAGCAGCTGACACTGCAGCAGCTGAATAATT\nContig0:56088-56333;size=1\n-CATGCTTCATTAGTTGTTTGTGTCCAGCTAGCAATGGACTTAAATCAAGAGATAGCCGAGATTGGAATCTG-------------------------------------------------------------------------------------------------TAATAATCAAGAGAGTTTGGGCTGTCTTTTGTTTGCAGTAATGTTTAGCAGCTGACACTGCAGCAGCTGAATAATT\nContig0:56088-56333;size=1\n-CATGCTTCATTAGTTGTTTGTGTCCAGCTAGCAATGGATTTAAATCAAGAGATAGCCGAGATTGGAATCTG--------------------------------------------------------------------------------------------------------------AGTTTGGGCTGTCTTTTGTTTGCAGTAATGTTTAGCAGCTGACACTGCAGCAGCTGAATAATT\n//\n//\nContig0:59313-59545;size=1\nCATGGAGGGTGGGGGCTTCTGGTTGTAACTGGGGATGATCCCTCTGACCAGTTCCATGCCACTGTGG-----------------------------------------------------------------------------------------GCTTCCCCTGGTGTCAGGGCAAGCCCCCGAGAGGAACAATGACAAAGCATCACATACCGAGGTGGAATATTTAATT\n//\n//\nContig0:62283-62553;size=3\n-AATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAACTCTACTTAAGACTTCAGGCCGCAGAGGT----------------------------------------------------------------------AGCAGTATAGGGCCTATGACACTCAGTGGTGCAGTTCTGATTCCATCCAGCAGAGGACAGTGCTGACCATG----------------------------------------------------\nContig0:62283-62553;size=1\nAAATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAA---------------------------------------------------------------------------------------------------AGCAGTATAGGGCCTATGACACTCAGTGGTGCAGTTCTGATTCCATCCAGCAGAGGACAGTGCTGACCATG----------------------------------------------------\nContig0:62283-62553;size=1\nAAATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAAC--------------------------------------------------------------------------------------------------AGCAGTATAGGGCCTATGACACTCAGTGGTGCAGTTCTGATTCCATCCAGCAGAGGACAGTGCTGACCATG----------------------------------------------------\nContig0:62283-62553;size=1\n-AATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAACTCTACTTAAGACTTCAGGCCGCAGAGGT--------------------------------------------------------------------------------------------------------------------------GAGGACAGTGCTGACCATGCACATACTAACTAGTATGTTATAATGCCCTTATTTTGGGCTTTTCCTCCATG\n//\n//\nContig0:63207-63392;size=1\nCATGTTCAATGGTTTCACTTCAACGCAAACAGCCGTGGCTTCGATTGTGAACAGAAAATGTTGTGGCACAG--------------------------------------TGCCTCGCTGGTCTCTGAGGGCCCTGAAGAGCTTGCACGCCAAAGGACCAGATGGCTGGTGGGTTAAATGGAAATT\nContig0:63207-63392;size=1\nCATGTTCAATGGTTTCACTTCAACGCAAACAGCCGCGGCTTCGATTGTGAACAGAAAATGTTGTGGCACAG--------------------------------------TGCCTCGCTGGTCTCTGAGGGCCCTGAAGAGCTTGCACGCCAAAGGACCAGATGGCTGGTGGGTTAAATGGAAATT\n//\n//\nContig0:64734-65000;size=12\nAATTTGCTGCTAGGCCCAGACTAAAAGCAGGTAGCCACTGCGAATGCACAAGAGAGCCCAATGTAGCCCTGGAGAG-----------------------------------------------------------------------------------------------------------------------CTGTTCTCTGGCTTTGAACGACTGTGTGCTTTGGTGAAATACCGCTCAGAGCCAATGGGGTGGCATTCATG\nContig0:64734-65000;size=1\nAATTTGCTGCTAGGCCCAGACTAAAAGCAGGTAGCCACTGCGAATGCACAAGAGAGCCCAATGTAGCCCTGG---------------------------------------------------------------------------------------------------------------------------CTGTTCTCTGGCTTTGAACGACTGTGTGCTTTGGTGAAATACCGCTCAGAGCCAATGGGGTGGCATTCATG\nContig0:64734-65000;size=1\nAATTTGCTGCTAGGCCCAGACTAAAAGCAGGTAGCCACTGCGAATGCACAAGAGAGCCCAATGTAGCCCTGGAGAG-----------------------------------------------------------------------------------------------------------------------CTGTTCTCTGGCTTTGAACGACTGTGTGCTTTGGTGAAATGCCGCTCAGAGCCAATGGGGTGGCATTCATG\nContig0:64734-65000;size=1\nAATTTGCTGCTAGGCCCAGACTAAAAGCAGGGAGCCACTGCGAATGCACAAGAGAGCCCAATGT-----------------------------------------------------------------------------------------------------------------------------------CTGTTCTCTGGCTTTGAACGACTGTGTGCTTTGGTGAAATACCGCTCAGAGCCAATGGGGTGGCATTCATG\n//\n//\nContig0:65302-65475;size=1\nAATTAAAAACGAACCATAAACAACAACTGAGAAAGGATTCCAGATCGTCTTTTAAAAAAGGAAGCGGGTTCTCCGG--------------------------GAAAAATGCACAGTTGATAGGAAAGAGTGTTAGCTCCATCGTGAGCCCCTGACCCACAAGGGCTGTGCATG\n//\n//\nContig0:67691-68021;size=1\nCATGATCTTATGGTCACTCTACCATTCTCCCATGCACCTGGAAGAGAAAGACAGAAGCTGTGTTAACCGAC-------------------------------------------------------------------------------------------------TTAGCACCTTTCGCTCAAGGGCTTCTAATGCTCCCCTCACCTCCCCTCAATAATGATACTTAGCACTTGATGAATT--------------------------------------------------------------------------------------\nContig0:67691-68021;size=1\n------------------------------CATGCACCTGGAAGAGAAAGACAGAAGCTGTGTTAACCGACTGAACCCATGAGATGCTAACGAGGTCTCTC-------------------------------------------------------------------TTAGCACCTTTCGCTCAAGGGCTTCTAATGCTCCCCTCACCTCCCCTCAATAATGATACTTAGCACTTGATGAATT--------------------------------------------------------------------------------------\nContig0:67691-68021;size=1\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTAAATAAGCCTTACCTGCTGTGAAATGTCAATACTATTTTGAAGATGGGGAAACCGAGGCATG\n//\n//\nContig0:68467-69007;size=9\n----------------------------------------------------------------------------------------------------------------------------------AATTGATGCAACTTCTGTGTGTAGAGTAGGCTTCCGTGTTTTCAGATGCACATATTTGGGTCTCTGACCTGGTCCA------------------------------------------------------------------------------------------------TCCCCGGTCCCGCAGAGTCGGTGGCTTGCCCGTGGGAGGTCCCTGGTCCTGTGGATTCGGTGGCTTGCATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:68467-69007;size=1\nAATTTTACCTGAAGTTCATTTCAGAAGGGGAAGCTAAACTGGGGCAAGTGGGTTTAAACTCATGTGACGGTTCCCACACACAAAGTTGCACCAGGGTACCCATG----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:68467-69007;size=1\n----------------------------------------------------------------------------------------------------------------------------------AATTGATGCAACTTCTGTGTGTAGAGTAGGCTTCCGTGTTTTCAGATGCACATATTTGGGTCTCTGACCTGGTCCA-------------------------------------------------------------------------------------------------CCCCGGTCCCGCAGAGTCGGTGGCTTGCCCGTGGGAGGTCCCTGGTCCTGTGGATTCGGTGGCTTGCATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:68467-69007;size=1\n----------------------------------------------------------------------------------------------------------------------------------AATTGATGCAACTTCTGTGTGTAGAGTAGGCTTCCGTGTTTTCAGATGCACATATTTGGGTCTCTGACCTG-----------------------------------------------------------------------------------------------------TCCCCGGTCCCGCAGAGTCGGTGGCTTGCCCGTGGGAGGTCCCTGGTCCTGTGGATTCGGTGGCTTGCATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:68467-69007;size=1\n----------------------------------------------------------------------------------------------------------------------------------AATTGATGCAACTTCTGTGTGTAGAGTAGGCTTCCGTGTTTTCAGATGCACATATTTGGGTCTCTGACCTGGTCCA------------------------------------------------------------------------------------------------TCCCCGGTCCCGCAGAGTCGGTGGCTTGCCCGTGGGAGGTTCCTGGTCCTGTGGATTCGGTGGCTTGCATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:68467-69007;size=1\n--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTGCTGCCGAATCCGCAGTACCAGCCGACCTCCCACAGGCAAGCCGTCGAAGGCTGCCTGTCTGCCGCCCTCGCAGAGACCGGCAGGGCTCCCCCTGAAGCTTGCCCTGCCCCAGGGACATG\n//\n//\nContig0:69863-70116;size=19\nAATTTGCTGGGTCCCTGGCCGGGGCAGGCCTGTAACTCAGCAGTCGGTTGTCTCATAGAATCTCAGGGTTGGAAGG----------------------------------------------------------------------------------------------------------TGGGTTTAGCAGGCTAATGCTCAGATCACTGAGCTATCCCTCCCCCCCACTTCACTACCCACTAGGCCATG\nContig0:69863-70116;size=1\nAATTTGCTGGGTCCCTGGCCGGGGCAGGCCTGTAACTCAGCAGTTGGTTGTCTCATAGAATCTCAGGGTTGGAAGG------------------------------------------------------------------------------------------------------------GGTTTAGCAGGCTAATGCTCAGATCACTGAGCTATCCCTCCCCCCCACTTCACTACCCACTAGGCCATG\nContig0:69863-70116;size=1\nAATTTGCTGGGTCCCTGGCCGGGGCAGGCCTGTAACTCAGCAGCCGGTTGTCTCATAGAATCTCA-----------------------------------------------------------------------------------------------------------------------GGTTTAGCAGGCTAATGCTCAGATCACTGAGCTATCCCTCCCCCCCACTTCACTACCCACTAGGCCATG\n//\n//\nContig0:71142-71672;size=1\nAATTAGGCTTGAATAAAAACTGGGAGTGGATGGGCCATTACACAAAGTAAAACTATTTCCCCATGTTTATTTTCCC-----------------------------------------------------------------------------CCTGCTGGTAATAGCTCACCTTAACTGATCACTCTCGTTAGAGTGTGGATGGTAACACCCATTGTTTCATG------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:71142-71672;size=1\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGTTAGTCTCTAAGGTGCCACAAGGACTCCTGTTCTTTTTGAGGATACAGACTAGCATGGCTGCTACTCTGA------------------------------------------------------GTGCTGAGAGTTCGATGACTCTGGTAGCTGGGGGGATGTTCACTGGGACCCCCCCGCTCCCAGGAGACATG\n//\n//\nContig0:73757-73975;size=1\nCATGGTCTGTTTGCTACCTTGGGGCTGGCCTCGCTGGATGGT---------------------------------------------------------------------------------------------------------GGTGCTCCAGAGACTGGTGCTGAGGATGCAGCAGGGGGTGCTCTCCCTTCTGCATCAGAACGTATCCCATG\n//\n//\nContig0:74377-74795;size=1\nAATTGGTATAAGAATCCAAGGATGGTAACAAGAGCCGAAAATGCCACCCAAGGACTTCGATTCACTGACCTTGTGG-------------GCTGGAAAATGGCAGACACACTTGTTCTTTCTTGTTAGCCTCTGGAGACGCTGATGGGGCCTGAGGACATG------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:74377-74795;size=1\n----------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCAGGTCAAGGAGTTAGCACCGGACTGGCAGCCCATGACTGTCCCCTGATACCATCACAGGGCTGG----------------------------------------------------------------------------------------------------------------------TGGGCACCCACACAGATCTGCTGATGCCCGTCACTCCTCACCGACGGCCACCTCGAGGGCAGATTAACATG\n//\n//\nContig0:75032-75494;size=2\nCATGCCCCCACAGGGGGTGTGTGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGGGTCAGAGCCCAG---------------------------------------------------------------------------------AAAGCCCCTTGCTCCTGCTACCCAATCGCCTGCCACCCCTGCCACATTCAGAGTCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:75032-75494;size=1\nCATGCCCCCACAGGGGGTGTGTGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGGGTCAGAGCCCAG--------------------------------------------------------------------------------------------------------------------CCCCTGCCACATTCAGAATCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:75032-75494;size=1\nCATGCCCCCACAGGGGGTGTGTGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGGGTCAGAGCCCAG-------------------------------------------------------------------------------------CCCCTTGCTCCTGCTACCCAACCGCCTGCCCCCCCTGCCACATTCAGAGTCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:75032-75494;size=1\nCATGCCCCCACAGGGGGTGTGTGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGGGTCAGAGCCCAG--------------------------------------------------------------------------------------------------------------------CCCCTGCCCCATTCAGAGTCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:75032-75494;size=1\nCATGCCCCCACAGGGGGTGTGTGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGGGTCAGAGCCCAG-------------------------------------------------------------------------------------------------------------CCTGCCTCCCCTGCCACATTCAGAGTCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:75032-75494;size=1\nCATGCCCCCACAGGGGGTGTGTGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGGGTCAGAGCCCAG--------------------------------------------------------------------------------------CCCTTGCTCCTGCTACACAATCGCCTGCCACCCCTGCCACATTCAGAGTCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:75032-75494;size=1\nCATGCCCCCACAGGGGGTGTGTGCCCCCGCCGCCCTGGTTGAGAATCTCTGTTTAGAGGGTCAGAGCCCAG---------------------------------------------------------------------------------AAAGCCCCTTGCTCCTGCTCCCCAATCGCCTGCCACCCCTGCCACATTCAGAGTCAGATCCAACTTCCCCCGAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:75032-75494;size=1\n--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGGAGATTGAGGGTTTTGAATATACAACTTCAGACTGTTAAGTGGGGTTATTGTAGCTTCCCTGCTCAATAGCTTCATACAGAATT\n//\n//\nContig0:76480-77015;size=6\n----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGACACACAGCTGTGTCTTTCTGATTCCATCCAATAGGGGGCAGCAGAGCATATAAACATTAACCAGTT------------------------------------------------------------------------------------ATATGCAGTTTTTCTTGTTCCCCTCCAGAGGGGTCAGTAATAAAACAATGGATATCATGGGATTAGAGCTGAAATT\nContig0:76480-77015;size=4\nCATGTCTTATCTGGATCAGTCAACAGCATCTCTGCCTTTATCGGCCATCTATCTACAATATGTATCTAAGC------------------------------------------------------------------------------------CTACACCTTCTTTTAACAGATCCCTGGGGACCCAGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:76480-77015;size=2\nCATGTCTTATCTGGATCAGTCAACAGCATCTCTGCCTTTATC-GCCATCTATCTACAATATGTATCTAAGCA-----------------------------------------------------------------------------------CTACACCTTCTTTTAACAGATCCCTGGGGACCCAGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:76480-77015;size=1\nCATGTCTTATCTGGATCAGTCAACAGCATCTCTGCCTTTATC-GCCATCTATCTACAATATGTATCTAAGCA-----------------------------------------------------------------------------------CTACACCTTCTTTTAACAGATCCCTGGGGACCCGGGACATCACTAGGCACTTTATGGTCACAGTAGGGATAGAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:76480-77015;size=1\n----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGAGAATCTCTATAAGCACTATAGGGCTCATGACACACAGCTGTGTCTTTCTGATTCCATCCAATAGGG------------------------------------------------------------------------------------------------------------------ATATGCAGTTTTTCTTGTTCCCCTCCAGAGGGGTCAGTAATAAAACAATGGATATCATGGGATTAGAGCTGAAATT\nContig0:76480-77015;size=1\n----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGACACACAGCTGTGTCTTTCTGATTCCATCCAATAGGGGGCAGCAGAGCATATAAACATTA-------------------------------------------------------------------------------------------------AGTTTTTCTTGTTCCCCTCCAGAGGGGTCAGTAATAAAACAATGGATATCATGGGATTAGAGCTGAAATT\n//\n//\nContig0:79124-79379;size=1\nCATGGACATAAGCCTTACGCCTCTCCTGGAAGTGGAGTTATGATGTCCGTGCAGTGGGGCGCTTACCTCG-------------------------------------------------------------------------------------------------------------GGTGCTGGGCAAATGGTCGAAGACTTGTTCCAACCTCTCCACTCTTGAGACCCAAACTTTGGGTCTGGTGGGAATT\nContig0:79124-79379;size=1\nCATGGACATAAGCCTTACGCCTCTCCTGGAAGTGGAGTTATGATGTCCGTGCAGTGGGGCGCGTACCTCGA--------------------------------------------------------------------------------------------------------------TGCTGGGCAAATGGTCGAAGACTGGTTCCAACCTCTCCACGCTTGAGACCCAAACTTTGGGTCTGGTGGGAATT\nContig0:79124-79379;size=1\nCATGGACATAAGCCTTACGCCTCTCCTGGAAGTGGAGTTATGATGTCCGTGCAGTGGGGCGCTTACCTCGA---------------------------------------------------------------------------------------------------------------------------------------------CCTCTCCACTCTTGAGACCCAAACTTTGGGTCTGGTGGGAATT\n//\n//\nContig0:79959-80151;size=1\nAATTTTTAAGCAACAGGATTTACTACCCGCTGTGGGAGGTGACAGGCTGATGGCTGTCTGGGAGACCCTCTCTGTC---------------------------------------------GTAGCCTGAACAGGCAGGATCCCACTAGTTGGCCATCTCCTCCATCCTCCCGCTAGCCACAGGTAGACATG\nContig0:79959-80151;size=1\nAATTTTTAAGCAACAGGATTTACTACCCGCTGTGGGAGGTGACAGGCTGATGGCTGTCTGGGAGACCCTCTCTGTC---------------------------------------------GTAGGCTGAACAGGCAGGATCCCACTAGTTGGCCATCTCCTCCATCCTCCCGCTAGCCACAGGTAGACATG\n//\n//\nContig0:81339-81852;size=11\nCATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGGAC--------------------------------------------------------------------------------------------------TTTATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGTTATAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:81339-81852;size=6\nCATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGGAC--------------------------------------------------------------------------------------------------TTTATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGCTATAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:81339-81852;size=4\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGAAAAGTCAGGAAAAGAGCAGAGGCCGGACGTGGCTTCTCTCCCCTTGGCTTCACTGGGTTTTGATTGGCC-----------------------------------------------------------------------------------------------------------------------------GCAGGACCTCGGTCACTCTGGGCCTCATTCTGCTCTCCGGGATGCTGGTTTAGTGCTGCTGGGACTCCATG\nContig0:81339-81852;size=1\nCATGCGTGGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGG------------------------------------------------------------------------------------------------------TATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGTTATAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:81339-81852;size=1\nCATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGAT--------------------------------------------------------------------------------------------------------------------------TAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGTTATAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:81339-81852;size=1\n-ATGCGTAGGCTTGGAAACCCCTAGGGCACAAGCCTGGTGCCTGCAGAGCACAGGGTCGATCTCCCCGG----------------------------------------------------------------------------------------------------TTTATTACACGTTCTAGGATGAAGCCACTGTGGCAGGGAAGCTCTCCGGAAATGTAGAAGTTAAAACGTTATAATT----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:81339-81852;size=1\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGAAAAGTCAGGAAAAGAGCAGAGGCCGGACGTGGCTTCTCTCCCCTTGGCTTCACTGGGTTTT-------------------------------------------------------------------------------------------------------------------------------------GCAGGACCTCGGTCACTCTGGGCCTCATTCTGCTCTCCGGGATGCTGGTTTAGTGCTGCTGGGACTCCATG\n//\n//\nContig0:82206-82699;size=4\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATTTCTTCTTGAGGGAGGCAGAGGA----------------------------------------------------------------------------------------GGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG\nContig0:82206-82699;size=1\nCATGCAATGCAACAAGGTCTGGAGCTCCTTAGCGAGCCTTCGAGCCACCCAGTCCCTGAAATACACCCCCTGGTCCCTTTCAGGCTGGATCCAGAATGGAAAGTGTAACACCAATT-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:82206-82699;size=1\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCATTCTTCCTATCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATTTCTTCTTGAGGGAGGCAGAGGA----------------------------------------------------------------------------------------GGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG\nContig0:82206-82699;size=1\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATTTCTTCTTGAGGGAGGCAGAGG-----------------------------------------------------------------------------------------GGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG\nContig0:82206-82699;size=1\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCATTCTTCCTTTCCCATACCTCCCGCCCCGCTCCTTTCCTCGCTTGATTTCTTCTTGAGGGAGGCAG-------------------------------------------------------------------------------------------TGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG\nContig0:82206-82699;size=1\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCTCTCTTGATATCTTCTTGAG---------------------------------------------------------------------------------------------------TGGCTCCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATG\n//\n//\nContig0:83146-83367;size=1\nAATTCCATCCTGATCCTTCGAACCCAGCACGTTCCTACAACAGGTTTCAGTTTTGACGAACTGGTATTTGTTGCTG-----------------------------------------------------------------------------CCTACCAAGCACCCGAGGGTGTCAGTGGCTGCCCTGAAATGACGGGCACTGCTTGTTGCTGCCCCATG\nContig0:83146-83367;size=1\nAATTCCATCCTGATCCTTCGAACCCAGCACGTTCCTACAACAGGTTTCAGTTTTGACGAACTGGTATTTGTTGCTG--------------------------------------------------------------------------CACCCTACCAAGCACCCGAGGGTGTCAGTGGCTGCCCTGAAATGACGGGCACTGCTTGTTGCTGCCCCATG\n//\n//\nContig0:86329-86448;size=1\nAATTTGGGGGCACTGCTTTTTGGTGCCCCCAAATCTCGGTGCCACCGCCTAGTTCACCTAGTGGTTACACCGGCCCTGAGCGTCTGGGGAGTCTTTCCTCCCGAAGGCCCAGCAGCATG\n//\n//\nContig0:95645-95887;size=10\nAATTGAGGCAACTTTAACTGATATGTCCTGAAGTCACGAGGCCTTAATGGGCATCCTACGTGTAAAGGGCATCCTT-----------------------------------------------------------------------------------------------AAGTGATAAGAAGACTTTTTTATATACTGTGCAAGACTAACCTGTGCAACTCATTGCTACAGGATATCATG\nContig0:95645-95887;size=4\nAATTGAGGCAACTTTAACTGATATGTCCTGAAGTCACGAGGCCTTAATGGGCATCCTAGGTGTAAAGGGCATCCTT-----------------------------------------------------------------------------------------------AAGTGATAAGAAGACTTTTTTATATACTGTGCAAGACTAACCTGTGCAACTCATTGCTACAGGATATCATG\n//\n//\nContig0:98705-98903;size=1\nCATGCAGTGTAGCCGCTGTTTGTTGGCAGTGCGACAAAAAACTTCCACCATCAATGAGTGGCGTTTACATT---------------------------------------------------TCGCGGTAAAACTTTTGTCTTTCGGGGTGGGGTGGGGATGTTTGGGTTAACACCACAAAAGTTTTGTCATTCAATT\n//\n//\nContig0:99204-99415;size=1\nCATGTGAGGGCAGGAGAAATGCCAAATCCAATCACTGCCTAGGGACAACTCTGACCTTTACCCCGAACAGC--------------------------------------------------------------------------------------------CCAGGTTTGATCCCTCACTGGTGAGGAGTCCTGCAGAGGTAGCACATG\nContig0:99204-99415;size=1\nCATGTGAGGGCAGGAGAAATGCCAAATCCAATCACTGCCTAGGGACAACTCTGACCTTTACCCCGAACAGC----------------------------------------------------------------------------------------------------------TCACTGGTGAGGAGTCCTGCAGAGGTAGCACATG\n//\n//\nContig0:101553-101788;size=4\n----CATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAGGC---------------------------------------------------------------------------------------CGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGACTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT\nContig0:101553-101788;size=3\nCATGCATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAGG----------------------------------------------------------------------------------------CGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGACTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT\nContig0:101553-101788;size=1\nCATGCATGTAACTTCATAATAGACAAGTCAGGCAACAGCCTGGATTTCACGAAGGGCTGGAGGATTTCAG-----------------------------------------------------------------------------------------CGCCCTAGGGAGCAATCCTGTATTGCTCCCAGGGAGACTGACTGGGTGATCCAACAGTGCTTTTCTGTCTCTAATT\n//\n//\nContig0:102134-102336;size=1\nAATTTGGCCTGTTCTCCTTGCACTAGAAGTTGATATTTCCCTGCACATACGCTGCTCTTGCACGGACTCTTCTGTT-------------------------------------------------------CTCTCCTCCTTGCTCTAACACTCAAGTAAAAAATCGCCATCCCACATTATACTCCCCAATCATCCCGCATG\n//\n//\nContig0:103469-103709;size=16\nCATGGTCTACTAATCACAGAGATGCCACTGAGCTGGCCCTAGAACCAGGTGCAGCGTACGCAACTCTGTGC---------------------------------------------------------------------------------------------CCCCTGGTTTGTCCCCAGGCAGGAGGCGAGTAGAGGCCTCCAGGTATCAGTTTCTATCTGTCGTGTTTAACAAATT\nContig0:103469-103709;size=1\nCATGGTCTACTAATCACAGAGATGCCACTGAGCTGGCCCTAGAACCAGGTGCAGCGTACGCAACTCTGTGC---------------------------------------------------------------------------------------------CCCCTGGTTTGTCCCCAGGCAGGAGGCGAGTAGAGGCCTCCTGGTATCAGTTTCTATCTGTCGTGTTTAACAAATT\n//\n//\nContig0:104258-104379;size=1\nAATTTTGAAAGATGTTACTATTTATTTAGTTTTTGGCTCGTTGTCGTTCCTGATTGACTTTGTTTTGCTGGAGCAGTTGGGAGGGATTATTTCACTCTTCTTGGCAGCTTCCACATACATG\n//\n//\nContig0:104739-105191;size=1\nAATTACCAAGGGTTCCAGGTGGGCTTGTACCACTCGTGCTTCAGCCCGTGCTGTTATAGCTTCGCTGCTTCTGTAC----------------------------------------------------------------------------------------------------------CTTAATGTGCTAGCGAAAGGGTTGATGAATT-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:104739-105191;size=1\nAATTACCAAGGGTTCCAGGTGGGCTTGTACCACTCGTGCTTCAGCCCGTGCTGTTATAGCTTCGCTGCTTCTGTAC----------------------------------------------------------------------CGTCAACAAACCTTTAGTCTCAATCAGCGACCTCCACTTAATGTGCTAGCGAAAGGGTTGATGAATTCATG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:104739-105191;size=1\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTCATGTAACGATTAAAAAACGACAACAACAATAAACAGCACAAAAGTATATTACTTTAGTGTCCACCACAGCA------------------------------------------------------------------------------------------------AAAAAAGATACCCGGACGCCAGCAGTTTTCACCCTGTAACGGGAGAAAAGCCACTCTATGAAGTCCACATG\n//\n//\nContig0:106701-106806;size=1\nCATGGCCTTGGTGAGAGAGCTCTGTTCCAGAATCTGAGAGTCTCTTGTCACCACACACTTCTTCAGCTCCAGTATTTAGAGCCCCGGGCTGGGATTCACAAAATT\n//\n//\nContig0:107637-108115;size=13\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGACCTCACAGTCCCTTTCAGCCTGACACTTCTACGATTCTATGATGCACTGGAAGAGAACGGTCAGAA------------------------------------------------------------------------------------------------AGGGGTAGAGCTTTTCTAGTTTGGCTGTGCCAGCAAACTTTCCCAGTTAGAGCTGCAAGAGGCAGACTGAAAAATT\nContig0:107637-108115;size=1\nCATGTCCACTGAGGGTAGGACAAGAAGTAACGGGCTTAATC----------------------------------------------------------------------------------GGAGGTTGTGGAATCCCCGTCACAGGAGGTTTTGAAGAACAAGTCAGACAAACGCCTGTCAGGGATGGTCTAAATT---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:107637-108115;size=1\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGACCTCACAGTCCCTTTCAGCCTGACACTTCTACGATTCTATGATGCACTGGAAGAGAACGGTCAGAA--------------------------------------------------------------------------------------------------------AGCTTTTCTAGTTTGGCTGTGCCAGCAAACTTTCCCAGTTAGAGCTGCAAGAGTCAGACTGAAAAATT\n//\n//\nContig0:110181-110429;size=3\nAATTGCATTTGTAAACCATTTCACGACTTTGATGCACGATTTGCATGCATGATTTATCCCGGTCATATGAGATGAT-----------------------------------------------------------------------------------------------------ACAACAAAAAACAGCACCCCGAGCTCAGTGCCCCCCAACCTGGCAGCCCAGGTGGTTGCCTGGGTCACATG\n//\n//\nContig0:111104-111195;size=1\nCATGCAAGCAGACCAGAGCTGCCCCAAAGCACTGACGGTGGCCCCTTGACCTGTTTGAAATGCTACATTTGCCCCAAGCTGCAAGTGAATT\n//\n//\nContig0:111481-111551;size=1\nAATTAAAAATAGGCTGCGGCAAGAATGGATCCTCTGAACAGCGAAGGGAGGTACGTGGCAGGCTCCATGC\n//\n//\nContig0:112880-113019;size=1\nCATGCTGCTGCTTCCAGGAGCTGCCTGAGGTAAGCGCTGCCCAGAGCCTGCAACTCGGCTATCTCAAAAACTGACTGCGACAGTGAAAAATGAACAGTTCAAGGCTGAGAGCATCTGGTGATAAACTAACAAAAAAATT\n//\n//\nContig0:113494-113752;size=1\nAATTTCCATACCCAGATGTGTCCACGCCTGGACTCTACAGCATGCCGCAATCACATCTTGATAGCTGTGTCTACAC---------------------------------------------------------------------------------------------------------------ACAGGATTGTACTTGGGTGCTCAGCAACCCCAACCAGGGGCGGCTCTAGACATTTTGCCACCCCAAGCATG\n//\n//\nContig0:120824-121022;size=1\nAATTAGAGCAACCTCGAGGCTGCTCTACCTTACAGTAGGAGCAGGATAGACTCCCGGATAGTTCCAGAATATGATC---------------------------------------------------CACCAAAAGCTGCTGGGCTGGGACAGAGAACTGAGGTCTGTTTCTCTCAAGAAAGAACCAGGTTTCACATG\nContig0:120824-121022;size=1\nAATTAGAGCAACCTCGAGGCTGCTCTACCTTACAGTAGGAGCAGGATAGACTCCCGGATAGTTTCAGAATATGATC---------------------------------------------------CACCAAAAGCTGCTGGGCTGGGACAGAGAACTGAGGTCTGTTTCTCTCAAGAAAGAACCAGGTTTCACATG\n//\n//\nContig0:122245-122688;size=1\nAATTCCTCTAAAGCCGGTGTTCTGGGGGACGGACTGACTCAGGGGTACGGGTAAGGGAATAGATCACAGATCACTT-------GGTCCAGGATTGTACTTCCCAAATATGACGACTATATTGTACCCTTGCCTGTGCAAAAGGAGACATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:122245-122688;size=1\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGCATCGACTTCAAGAAACGGTCCCTCCGGGGCAAGGCAGAGGCAGGCT------------------------------------------------------------------AGTAGGATGGCCAAAGGGTTATACCCTTCAGATCTCCTTTACAGGTGCCATCTTCCGAGTCCGACTGCATG\nContig0:122245-122688;size=1\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGCATCGACTTCAAGAAACGGTCCCTCCGGGGCAAGGCAGAGGCAGGCTTGTAGGGTATGGGATGGGGGAAA-------------------------------------------AGTAGGATGGCCAAAGGGTTATACCCTTCAGATCTCCTTTACAGGTGCCATCTTCCGAGTCCGACTGCATG\n//\n//\nContig0:124322-124627;size=1\nAATTACCCACCTTTCAGAAGGCTGATTAATGACATGGCTCTGCAGCATGCACAGTAGCTCAACCTCCTGCATGAAA-----------------------------------------------------------------------CAAGCTATTCTTTCCCAGATCTGGCGTAGGAAGTGCGGCGCTTCCAGCGGCCAGGGAAGATCCCTGCCATG---------------------------------------------------------------------------------------\nContig0:124322-124627;size=1\n---------------------------------------------CATGCACAGTAGCTCAACCTCCTGCATGAAACACACTCATCTCTCTCTCTCCTGAGATTGACAGGTCACGG-----------------------------------------------------------------------------------------------------------------GGCTTTGTAGCAAACTCAGCAGAAGCGGCTTCTCACAGAGTCAATCCTGGATACAGAAGATTTGCCATAGCAAATT\n//\n//\nContig0:127068-127278;size=1\nCATGATCCTGGGAGCAGCCAGTGAAGAAAGCGAAATGCCATTATAGTGGG-----------------------------------------------------------------------------------------AGGTGGGGCGAGCTGAGGACATCTTCCCAGACCATCTCTGAGCAGCACAGAGGGTGGATCCCAGCTCCATG\n//\n//\nContig0:128188-128337;size=1\nCATGGCTTCCTCTTTTATAAGGCTCAGGTGCCTTCCCTTAAGCCCAGTCGGGCAGCAGCTAATCAGACACA--TGCCCTGGACGCCCCCCCTCCCTTCTGCTTCCTCCCTTAAAGGGGCCAGTCACCCTGTGACAGCAACATTTCAATT\n//\n//\nContig0:129266-129465;size=1\nAATTTACAAAATATGAAAGGAAACACTTCTTCACATAGCCTGTGGTACGCACTGCCCAGAGGGGCCAGAGAGTCCA------------------------------------------------------------------------------------CCTCAAGCTTCAGGACATCAACTGGTCAGGAAGGAATTT\n//\n//\nContig0:132993-133221;size=2\nAATTAATTCTGTTTTGTGTGTGAATAAAAAATGTGTGTGTTAAAAGAGAAATGTAAAGGCCATCACTGAACCTGAT---------------------------------------------------------------------------------ATTGACGACTCTAAAATATAAGACAGGCCCGTATCAATGAGGACAAAATAGCTGTAATCAAAACCAGCATG\nContig0:132993-133221;size=1\n----AATTCTGTTTTGTGTGTGAATAAAAAATGTGTGTGTTAAAAGAGAAATGTAAAGGCCATCACTGAACCTGATGGTC-----------------------------------------------------------------------------ATTGACGACTCTAAAATATAAGACAGGCCCGTATCAATGAGGACAAAATAGCTGTAATCAAAACCAGCATG\n//\n//\nContig0:135395-135645;size=11\nAATTACCTGGGACTCCTGCCTGGCTGGGGAGATCTGCTAGGCAATCAGATCCCACCTTCAGCACAGGGTATTCTAA-------------------------------------------------------------------------------------------------------CCGGAGGTGGCAAGCTTTACCCACCCCGTAATCAGCGCTGCGGCAAGCTGCATTATACTCCCAGAATCATG\nContig0:135395-135645;size=1\nAATTACCTGGGACTCCTGCCTGGCTGGGGAGATCTGCTAGGCAATCAGATCCCACCTTCAGCACAGGGTATT-----------------------------------------------------------------------------------------------------------CCGGAGGTGGCAAGCTTTACCCACCCCGTAATCAGCGCTGCGGCAAGCTGCATTATACTCCCAGAATCATG\nContig0:135395-135645;size=1\nAATTACCTGGGACTCCTGCCTGGCTGGGGAGATCTGCTAGGCAATCAGATCCCACCTTCAGCACAGGGTATTCTAA---------------------------------------------------------------------------------------------------------------------------------CGTAATCAGCGCTGCGGCAAGCTGCATTATCCTCCCAGAATCATG\nContig0:135395-135645;size=1\nAATTACCTGGGACTCCTGCCTGGCTGGGGAGATCTGCTAGGCAATCAGATCCCACCTTCAGCACAGGG-----------------------------------------------------------------------------------------------------------------GGAGGTCGCAAGCTTTACCCACCCCGTAATCAGCGCTGCGGCAAGCTGCATTATACTCCCAGAATCATG\n//\n//\nContig0:137333-137570;size=2\nCATGGTGTTCTATCTGCTGAGCGTGTTCTGAGGCTTTTCAACCTAAAAAGTAAGTAAAACTGGAAATG---------------------------------------------------------------------------------------------ACCTGCGTATTTTACCCTTCTCTCCCCAAAATCACGTCTGAACAAAACCCATGGAATGCTCAATACATAGTCAATT\n//\n//\nContig0:137831-137897;size=1\nCATGTTTCCTAGCAACCTTAAACTTCAAGTTGACTAAGTTTGTTGTCTGGAAACCTACTATAAATT\n//\n//\nContig0:139289-139543;size=1\nAATTGAACCTGGCTTGACTGAGTTCTAGGCTCATGCCCCAGCCTCTGCACTGTGCTCCTTCTTACCAAGAGCCATC-----------------------------------------------------------------------------------------------------------TACGGTCATGTCGTACGTAGGGCTCTGTCCTGCTCCCACTGCAGCCAGTGGCAAACTCCCGTCACTTCATG\n//\n//\nContig0:140146-140422;size=1\nAATTCATTTGAATATAAATACTGTACTTACAAGTCAGTGTGTAGAGCAGTATAAACAGCTCATTGTCTGTATGAAG---------------------------------------------------------------------------------------------------------------------------------TGAAGAACCACTGCTTTAAAGGGCAGTGTGGGAGGTTGGAAAGTTGTATGATTGTTAGGAGGGACAGAGCA\n//\n//\nContig0:141889-142575;size=3\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGGGATCTGAAAGGCTGTGGCAGCCAGAGAAAGAGGCGACTTTCCCCAGCTCCAGGGCTGCAGCTGCTGGGG---------------------------------------------------------------------------------------------------GAGGCATGGAAGAGAGGGCACATCCATTGCATTAGAAAGGCAAAACTACTGATATGAAAATATGAGTCATG---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:141889-142575;size=1\nCATGCTCCATTCAGGTGGNCCGCGGACAGTTCCCTCTAAGACACGCGCCTGTGTGGCTGCACACGAGAGAATAAAGGACCACCCACTTAATT------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:141889-142575;size=1\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGGGATCTGAAAGGCTGTGGCAGCCAGAGAAAGAGGCGACTTTCCCCA---------------------------------------------------------------------------------------------------------------------------GAGGCATGGAAGAGAGGGCACATCCATTGCATTAGAAAGGCAAAACTACTGATATGAAAATATGAGTCATG---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:141889-142575;size=1\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGGGATCTGAAAGGCTGTGGCAGCCAGAGAAAGAAGCGACTTTCCCCAGCTCCAGGGCTGCAGCTGCTGGGG---------------------------------------------------------------------------------------------------GAGGCATGGAAGAGAGGGCACATCCATTGCATTAGAAAGGCAAAACTACTGATATGAAAATATGAGTCATG---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:141889-142575;size=1\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTGGGATCTGAAAGGCTGTGGCAGCCAGAGAAAGAGGCGACTTTCCCCAGCTCCAGGGCTGCAGCTGCTGGGG---------------------------------------------------------------------------------------------------GAGGCATGGAAGAGAGGGCACATCCATTGCATTAGAAACGCAAAACTACTGATATGAAAATATGAGTCATG---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:141889-142575;size=1\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------AATTTTCAAGTGGTCTGTGGAAAAAAGAAACTGAGAACCAGTGCATTAGAGGAAGACATAGAAGAAGGTTCAGAGTTTGGAGCAGAAGACGATTCCATGCATG\n//\n//\nContig0:143165-143277;size=1\nCATGGCTGGTGGAGGATGCTCAGTAACACCTAATACCTCTGCTGGCAGATATTGTTAACCCCATCTTTAGAGAGGGAAATCTGCCCTAAAGCCTGCCACATTTTGACCAATT\n//\n//\nContig0:143580-143810;size=8\nAATTTTTCTTCTGGTTTGTCACAGTTTGCTAATATTCCTGCTCAGAGGGAAAAAAAAAGTCATACTTTAGACTTTG-----------------------------------------------------------------------------------CTTCCAACTCTTACCTGCTGCTAATCTTTGTTCAAACGTTGATGGCTCCAGGACGTGCCTTGGCCAACATG\nContig0:143580-143810;size=1\nAATTTTTCTTCTGGTTTGTCACAGTTTGCTAATATTCCTGCTCAGAGGGAAAAAAAA-------------------------------------------------------------------------------------------------------------------------GCTAATCTTTGTTCAAACGTTGATGGCTCCAGGACGTGCCTTGGCCAACATG\nContig0:143580-143810;size=1\nAATTTTTCTTCTGGTTTGTCACAGTTTGCTAATATTCCTGCTCAGAGGGAAAAAAAAAGTCATACTTTAGACTTTG------------------------------------------------------------------------------------------------CCTGCTGCTAATCTTTGTTCAAACGTTGATGTCTCCAGGACGTGCCTTGGCCAACATG\nContig0:143580-143810;size=1\nAATTTTTCTTCTGGTTTGTCACAGTTTGCTAATATTCCTGCTCAGAGGGAAAAAAAAAGCCATACTTTAGACTTT------------------------------------------------------------------------------------CTTCCAACTCTTACCTGCTGCTAATCTTTGTTCAAACGTTGATGGCTCCAGGACGTGCCTTGGCCAACATG\n//\n//\nContig0:145164-145418;size=5\nAATTTAACGTCCATTAGACATTCTGAAGGGGTAACGCTGAGTTAGACATTGGGTGCTGGTATTGTGATGGGGAAAG-----------------------------------------------------------------------------------------------------------CATCTGACAAATGGTCCCAGTGCCCTCCGAGAGGTGGAGAAGAATACGAATATTTAACATTTATAATCATG\nContig0:145164-145418;size=1\nAATTTAACGTCCATTAGACATTCTGAAGGGGAAACGCTGAGTTAGACATTGGGTGCTGGTATTGTGAT------------------------------------------------------------------------------------------------------------------------GACAAATGGTCCCAGTGCCCTCCGAGAGGTGGAGAAGAATACGAATATTTAACATTTATAATCATG\nContig0:145164-145418;size=1\nAATTTAACGTCAATTAGACATTCTGAAGGGGTAACGCTGAGTTAGACATTGGGCGCTGGT---------------------------------------------------------------------------------------------------------------------------CATCTGACAAATGGTCCCAGTGCCATCCGAGAGGTGGAGAAGAATACGAATATTTAACATTTATAATCATG\nContig0:145164-145418;size=1\nAATTTAACGTCCATTAGACATTCTGAAGGGGTAACGCTGAGTTAGACATTGGGTGCTGGTATTGTGATGGGGAAAG----------------------------------------------------------------------------------------------------------------GACAAATGGTCCCAGTGCCCTCCGAGAGGTGGAGAAGAATACGAATATTTAACATTTATAATCATG\nContig0:145164-145418;size=1\nAATTTAACGTCCATTAGACATTCTGAAGGGGTAACGCTGAGTTAGACATTGGGTGCTGGTATTGTGATGGGGAAAG-----------------------------------------------------------------------------------------------------------CATCTGACAAATGGTCCCAGTGCCCTCCGGGAGGTGGAGAAGAATACGAATATTTAACATTTATAATCATG\n//\n//\nContig0:145642-145890;size=1\nCATGGGCAGCGGACGTGTGTTGGCGTGGCTCTCTGCTGTGTACCAGGCCTTATTAACACAAAACGGTACGC-----------------------------------------------------------------------------------------------------TGTCAAATAGCAACAGGGCTGACCCAAAACCCATGTCCAAACATCCAAGAACTTCACAAAAGTTCAACTCTCAATT\nContig0:145642-145890;size=1\nCATGGGCAGCGGACGTGTGTTGGCGTGGCTCTCTGCTGTGTACCAGGCCTTATTAACACACAACGGTACGC-----------------------------------------------------------------------------------------------------TGTCAAATAGCAACAGGGCTGACCCAAAACCCATGTCCAAACATCCAAGAACTTCACAAAAGTTCAACTCTCAATT\nContig0:145642-145890;size=1\nCATGGGCAGCGGACGTGTGTTGGCGTGGCTCTCTGCTGTGTACCAGGCCTTATTAACACAAAACGGTACGC--------------------------------------------------------------------------------------------------------CAAATAGCAACAGGGCTGACCCAAAACCCATGTCCAAACATCCAAGAACTTCACAAAAGTTCAACTCTCAATT\nContig0:145642-145890;size=1\nCATGGGCAGCGGACGTGTGTTGGCGTGGCTCTCTGCTGTGTACCAGGCCTTATTAACACAAAACGGTACGC-------------------------------------------------------------------------------------------------AAGGTGTCAAATAGCAACAGGGCTGACCCAAAACCCATG-----------------------------------------\n//\n//\nContig0:146117-146349;size=1\nAATTAAACACGGGGTAAGCCACTGCCATGCTCTGACCCAAAGGTGGCTGGTGTATTTTCAGTAGCGGGAATGCACG-------------------------------------------------------------------------------------CCTAGAGCCCTGGGCCAGAGCAAGTGAGGGCCCCCCCACCCCCTCCAGTCCTGCCTCCAGCCCTGTTCATG\nContig0:146117-146349;size=1\nAATTAAACACGGGGTAAGCCACTGCCATGCTCTGACCCAAAGGTGGCTGGGGTATTTTCAGTAGCGGGAATGCACG-------------------------------------------------------------------------------------CCTAGAGCCCTGGGCCAGAGCAAGTGAGGGCCCCCCCACCCCCTCCAGTCCTGCCTCCAGCCCTGTTCATG\n//\n//\nContig0:148378-148953;size=2\nCATGCTGCAGCTAGGGGGAAATGGGGGCAGGGCCGGGGGAACCTCACCCTCCCCAGCTGCGAATCCTGGGA---------CGATCCGACCCGCGGACTGGAGTTCTTTACCCACCCCCTGGCCCTTTAACAACCGGTTCTCCACGGAGGTCTAATT-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:148378-148953;size=1\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTCCTGGTTCACTTCTTCGTCTCTTATAACTTTTGTATTGAGTGCAATACATAGGGCTTGCAAAAATG---------------------------------------------------------------------TCTGGCTGTCCTGTGGAGATTTACAAACCAACCTGCTTCATTGACAGTGCAGTGGGGTTTTCTTCTGTTTCAAATT\nContig0:148378-148953;size=1\n-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGTCCTGGTTCACTTCTTCGTCTCTTATAACTTTTGTATTGAGTGCAATACATAGGGCTTGCAAAAATG------------------------------------------------------------------------------------------------------------ATTGACAGTGCAGTGGGGTTTTCTTCTGTTTCAAATT\n//\n//\nContig0:149397-149596;size=2\nAATTTGAGTTCAGCTGTATTTGCAAAGAGGTGAAACACTCCAAACCAAAATGAAACATTTCGTTTGACAGAACTGT----------------------------------------------------CTCAGCCAACCAACTGAAAAATCTGTTATTTTAACAGCTCTACCCAGGAAGCAAGAGCCAGAGCAGGCATG\n//\n//\nContig0:150178-150481;size=1\nAATTCTTCGGAAGAGCCTTTTTTTCTAACCTGTGATGCAAGCTGGTCCATTTCAAACACCAGGTATCTCCAAAACAATCCCCACAGGCAGCTACCCCCCTCCAAGATCTCCAAGCACCACTTCAAATAGCGACATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:150178-150481;size=1\nAATTCTTCGGAAGAGCCTTTTTTTCTAACCTGTGATGCAAGCTGG-----------------------CCAAAACAATCCCCACAGGCAGCTACCCCCCTCCAAGATCTCCAAGCACCACTTCAAATAGCGACATG-----------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:150178-150481;size=1\n------------------------------------------------------------------------------------------------------------------------------------CATGTGGAATCCTGTGCTATCAATCTGCTCCTCAAAAGAGCCCCCATGAACAAAGCAGAGATGTTCCTAGC------------------------TAGCAAATCCCAGCCTGGCCCTGGGTTCAGACAAGCTCAAACCAGCACAATACAAAGAGAAGAGCAAAGTCTAATT\n//\n//\nContig0:150767-151006;size=17\nAATTTGATACCATATTCTTCTCTATTTTCTGTCCTCTGTGCACCAAAATCCCACCAAAGCAGGATGTTTGGGGGGT--------------------------------------------------------------------------------------------GTAGATGTGTTTTGGTTGCAGGGAGGCATTCTCTGGGGACATACACATAGGAATGCCTGTATGTCTACATG\nContig0:150767-151006;size=1\nAATTTGATACCATATTCTTCTCTATTTTCTGTCCTCTGTGCACCAAAATCCCACCAAAGCAGGATGTTTGGGGGG---------------------------------------------------------------------------------------------GTAGATGTGTTTTGGTTGCAGGGAGGCATTCTCTGGGGACATACACATAGGAATGCCTGTATGTCTACATG\n//\n//\nContig0:151911-152069;size=1\nAAATTGCATCGAAGAGCACAGAGGCAGGG----------------------------------------------------------TAAACCTACGCTAATGAACACAAATATAGAAGAGAAGGATCACCCAGCAGTGTTAAGGAAGCCGCCGCATG\n//\n//\nContig0:152674-152906;size=7\nAATTTCCCAGCTTCTGGGGAGTCCTGGAGAGGAGGAAGAATCCGGATCCCTTTGCAGACGGCATTGCTGCTGCACT----------------------------------------------------------------------------------------CTCTGAAGAAGGGCCAGGGGTCAGGGGTGAAAGTATGGGCTTCGTTGTGAGGAAGTGTCCCTGCCATG\nContig0:152674-152906;size=1\nAATTTCCCAGCTTCTGGGGAGTCCTGGAGAGGAGGAAGAATCCGGATCCCTTTGCAGACGGCATTGCTGCTGCACT----------------------------------------------------------------------------------------CTCTGAAGAAGGGCAAGGGGTCAGGGGTGAAAGTATGGGCTTCGTTGTGAGGAAGTGTCCCTGCCATG\n//\n//\nContig0:153152-153312;size=1\nCATGTACGTCGCACGACTGAATCAATGGGGCTCCTCACCCACTTAAAGGGAATCGTGGGGTTCAGTGCTTT-------------AGGTTTGTTTTGCCACCTCCTGGACTGGCACCTACGAGTCTAAGGCTAAATATTATTTCTAGTAGGGCTGCCAATT\n//\n//\nContig0:153761-154003;size=17\nCATGGCCTCCTCCAAACACCTTCTTTATTCTCACCACAGGACCTTCCTCCTGGTGTCTGATAATGCTTGTA-----------------------------------------------------------------------------------------------CTTTTTAAACCCAGGTGCCCTGATTAGCCTGCTTTGATTGGCTGGAGGTGATCTAATCAGCCTGTCTGCCTTAATT\nContig0:153761-154003;size=1\nCATGGCCTCCTCCAAACACCTTCTTTATTCTCACCACAGGACCTTCCTCCTGGTGTCTGATAATGCTTGTA-----------------------------------------------------------------------------------------------CTTTTTAAACCCAGGTGCCCTGATGAGCCTGCTTTGATTGGCTGGAGGTGATCTAATCAGCCTGTCTGCCTTAATT\nContig0:153761-154003;size=1\nCATGGCCTCCTCCAAACACCTTCTTTATTCTCACCACAGGACCTTCCTCCTGG-------------------------------------------------------------------------------------------------------------------------------------------------TTTGATTGGCTGGAGGGGATCTAATCAGCCTGTCTGCCTTAATT\nContig0:153761-154003;size=1\nCATGGCCTCCTCCAAACACCTTCTTTATTCTCACCACAGGACCTTCCTCCTGGTGTCTGATAATGCTTGTA------------------------------------------------------------------------------------------------TTTTTAAACCCAGGTGCCCTGATTAGCCTGCTTTGATTGGCTGGAGGTGATCTAATCAGCCTGTCTGCCTTAATT\nContig0:153761-154003;size=1\nCATGGCCTCCTCCAAACACCTTCTTTATCCTCACCACAGGACCTTCCTCCTGGTGTCTGAT-------------------------------------------------------------------------------------------------------------------------------------------TGATTAGCTGGAGGTGATCTAATCAGCCGGTGTGCTTTAATT\nContig0:153761-154003;size=1\nCATAGCCTCCTCCAAACACCTTCTTTATTCTCACCACAGGACCTTCCTCCTGGTGTCTGATAATGCTTGTA-----------------------------------------------------------------------------------------------CTTTTTAAACCCAGGTGCCCTGATTAGCCTGCTTTGATTGGCTGGAGGTGATCTAATCAGCCTGTCTGCCTTAATT\nContig0:153761-154003;size=1\nCATGGCCTCCTCCAAACACCTTCTTTATTCTCACCACAGGACCTTCCTCCTGGTGTCTGATAATGCTTGTA--------------------------------------------------------------------------------------------------TTTAAACCCAGGTGCCCTGATTACCCTGCTTTGATTGGCTGCAGGTGTTCTAATCAGCCAGTCTGCCTTAATT\n//\n//\nContig0:155102-155187;size=1\nAATTCAGTTCCCCATCACCTGGGGCCAGGACTCTTCTGGGCAGCTTCAGCAGGGACGGGAGGCCAGAGACTGTGTGTGGTCCATG\n//\n//\nContig0:157369-157580;size=1\nAATTCCTGGCTCTGCTACCGTCTAGCTGTGTGATGGTGCA-----------------------------------------------------------------------------------------------TCCTCTGATGAGAGATGCTATTTCTTAGCTCCTACATATTAGCCAAAGGCTATTTAACCCACTTTCCAGTGTAATT\n//\n//\nContig0:158525-158758;size=7\nAATTAGATCTTGTGCACAATGAGACAGGACGAGAACCTGGGCCTTTCCAGTCCAGAGACACAAGCCTCCACCACCA--------------------------------------------------------------------------------------ACCCCAGCAGACACGCAGGGCTTTGCTGCAACTGCGCCCCTTGGCAGCTGACATTGCTTATGTCTAACATG\nContig0:158525-158758;size=3\nAATTAGATCTTGTGCACAATGAGACAGGACGAGAACCTGGGCCTTTCCAGTCCAGAGACACAAGCCTCCACCACCA---------------------------------------------------------------------------------------CCCCAGCAGACACGCAGGGCTTTGCTGCAACTGCGCCCCTTGGCAGCTGACATTGCTTATGTCTAACATG\nContig0:158525-158758;size=1\nAATTAGATCTTGTGCACAATTAGACAGGACGAGAACCTGGGCCTTTCCAGTCCAGAGACACA----------------------------------------------------------------------------------------------------ACCCCAGCAGACACGCAGGGCTTTGCTGCAACTGCGCCCCTTGGCAGCTGACATTGCTTATGTCTAACATG\n//\n//\nContig0:160325-160508;size=1\nAATTTGCAAGGCAGGGAGCT--CAGTGTCTGCTCCAAAAATCCGCGCTCTCTGTCTCCCCGATGCTCCCTGTCACACT-----------------------------------------------------TAGCTGCCCACAATGCACCACTCCCAACAGCGCTGCAAATGTGGCCACACTT\n//\n//\nContig0:161569-161784;size=1\nAATTGAAATAGCAAGGAGGGTGCTCAGTGACGGTGGGCATGATATAAAAACCCAAACAGATCGGAAGTCAGTGGCA--------------------------------------------------------------------CCTGCATCAAAAAGAAGAGACTTCCTGAAATAGACAAGTGTCTCAGCAATGAGACAGGCTCAGAGAACATG\n//\n//\nContig0:163234-163571;size=1\nAATTCTGGCTCATTGAACTTCATTTACTCATTGCTGCAATCCCAAGCCTTCAAAAATCATG------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\nContig0:163234-163571;size=1\n-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------CATGGCTGCAGTTTAACTTTTTTGTTTTGAAAGTAAAGCAGTGATTCTCCTGCAATCTTGTGGGGCCTTAAGACAGACATCAAACATCCTGAGATTCGCAATAAGAACTCAATT\n//\n//\nContig0:165831-166037;size=1\nAATTTCCTGTCCTCCTTCTACCCAAAGCTTCCCCTCCCCACAACGTGCCTCAGGCCCAAAGAATGACCTCGGAAGC-----------------------------------------------------------CCGTGGACTGCCCCAGTGGAGGCATGACAAGATGGACCGTGCGAGTTCCCCCTTTTCTGTGCTGCCCCATG\nContig0:165831-166037;size=1\nAATTTCCTGTCCTCCTTCTACCCAAAGCTTCCCCTCCCCACAACGTGCCTCAGGCCCAAAGAATGACCTCGGAAGCATG-------------------------------------------------------------------------------------------------------------------------------\n//\n//\nContig0:169197-169391;size=1\nAATTAGCAAGGAGGTTTGTCCTCTTGCCCCCCAGGGGCATCTTTGTTGGGGCTGGTTCAGACTGATTGCATAA------------------------------------------------------CCCAGTGACCTCCAAGCACTTTACAAACCTGCCTCAGGGGCCTTACACAGGAGGAAAAGGAGACATG\nContig0:169197-169391;size=1\nAATTAGCAAGGAGGTTTGTCCTCTTGCCCCCCAGGGGCATCTTTGTTGGGGCTGGTTCAGACTGATTGCATAAGCT-----------------------------------------------TTCACCCAGTGACCTCCAAGCACTTTACAAACCTGCCTCAGGGGCCTTACACAGGAGGAAAAGGAGACATG'
In [397]:
regions = (i.split("\t") for i in fullregions)
regions = ((i, int(j), int(k)) for (i, j, k) in regions)
build_ref_cigars(data, sample)
In [438]:
reg = ('Contig0', 41920, 42479)
reg = ('Contig0', 7219468, 7220326)
reg = ('Contig0', 207998, 208219)
reg = ('Contig0', 16738, 16933)
reg = ('Contig0', 49781, 50005)
reg = ('Contig0', 76480, 76711)
reg = ('Contig0', 24391, 24908)
reg = ('Contig0', 7193, 7952)       # has merge
#reg = ('Contig0', 13327, 13845)     # has indels
reg = ('Contig0', 76480, 77015)  # has indels

reg = ('Contig0', 346131, 346193)  # valueerror

samfile = AlignmentFile(
    os.path.join(data.dirs.refmapping, 
    "{}-mapped-sorted.bam".format(sample.name)),
    'rb')
ref = get_ref_region(data.paramsdict["reference_sequence"], *reg)
reads = list(samfile.fetch(*reg))
In [519]:
def cigared(sequence, cigartups):
    start = 0
    seq = ""
    for tup in cigartups:
        flag, add = tup
        if flag is 0:
            seq += sequence[start:start + add]
        if flag is 1:
            pass
        if flag is 2:
            seq += "-" * add
            start -= add
        if flag is 4:
            pass
        start += add
    return seq    
In [456]:
# match paired reads together in a dictionary
rdict = {}
for read in reads:
    if read.qname not in rdict:
        rdict[read.qname] = [read]
    else:
        rdict[read.qname].append(read)

# sort keys by derep number
keys = sorted(
    rdict.keys(),
    key=lambda x: int(x.split("=")[-1]), reverse=True)

# build the cluster based on map positions, orientation, cigar
for key in keys:
    r1, r2 = rdict[key]
    if r1 and r2:
        print(r1.seq, r1.cigar)# r1.pos, r1.aend, r1.rlen, r1.aligned_pairs)
        print(cigared(r1.seq, r1.cigar))
CATGTTCAGTGTAGTATGAAGCGGGGTAGCACTTTCCACTGCAGAAGTAGGTCTCATTGTGTGCGTAGTGC [(4, 15), (0, 25), (1, 3), (0, 16), (1, 1), (0, 5), (1, 1), (0, 5)]
(4, 15)
15 15 
(0, 25)
40 40 ATGAAGCGGGGTAGCACTTTCCACT
(1, 3)
43 43 ATGAAGCGGGGTAGCACTTTCCACT
(0, 16)
59 59 ATGAAGCGGGGTAGCACTTTCCACTGAAGTAGGTCTCATTG
(1, 1)
60 60 ATGAAGCGGGGTAGCACTTTCCACTGAAGTAGGTCTCATTG
(0, 5)
65 65 ATGAAGCGGGGTAGCACTTTCCACTGAAGTAGGTCTCATTGGTGCG
(1, 1)
66 66 ATGAAGCGGGGTAGCACTTTCCACTGAAGTAGGTCTCATTGGTGCG
(0, 5)
71 71 ATGAAGCGGGGTAGCACTTTCCACTGAAGTAGGTCTCATTGGTGCGAGTGC
ATGAAGCGGGGTAGCACTTTCCACTGAAGTAGGTCTCATTGGTGCGAGTGC
In [310]:
out = open("test2.txt", 'w')
# build the cluster based on map positions, orientation, cigar
for key in keys:
    r1, r2 = rdict[key]
    if r1 and r2:
        
        # empty arrays
        aref = np.array(list(ref[1]))
        arr1 = np.zeros(aref.size, dtype="U1")
        arr2 = np.zeros(aref.size, dtype="U1")
        arr1.fill("-")
        arr2.fill("-")

        # how far ahead of the start does this read begin
        start = r1.reference_start - reg[1] 
        seq = cigared(r1.seq, r1.cigar)
        arr1[start:start+len(seq)] = list(seq)
        
        seq = cigared(r2.seq, r2.cigar)
        start = r2.reference_start - reg[1] 
        arr2[start:start+len(seq)] = list(seq)
        #print("".join(arr1), file=out)
        #print("".join(arr2), file=out)
        
        arr3 = join_arrays(arr1, arr2)
        print("".join(arr3), file=out)
        
out.close()
In [307]:
import numba

#numba.jit(nopython=True)
def join_arrays(arr1, arr2):
    arr3 = np.zeros(arr1.size, dtype="U1")
    for i in range(arr1.size):
        
        if arr1[i] == arr2[i]:
            arr3[i] = arr1[i]
        
        elif arr1[i] == "N":
            if arr2[i] == "-":
                arr3[i] = "N"
            else:
                arr3[i] = arr2[i]
                
        elif arr2[i] == "N":
            if arr1[i] == "-":
                arr3[i] = "N"
            else:
                arr3[i] = arr1[i]
        
        elif arr1[i] == "-":
            if arr2[i] == "N":
                arr3[i] = "N"
            else:
                arr3[i] = arr2[i]
        
        elif arr2[i] == "-":
            if arr1[i] == "N":
                arr3[i] = "N"
            else:
                arr3[i] = arr1[i]
                
        else:
            arr3[i] = "N"
    return arr3
    

a1 = np.array(list("AGAGAG-NN----"))
a2 = np.array(list("----ACTTNNTTT"))
de = np.array(list("AGAGANTTNNTTT"))   

a1 = np.array(list("AGAGAG-NN-------"))
a2 = np.array(list("----ACTTNNTTT---"))
de = np.array(list("AGAGANTTNNTTT---"))
In [308]:
join_arrays(a1, a2)
Out[308]:
array(['A', 'G', 'A', 'G', 'A', 'N', 'T', 'T', 'N', 'N', 'T', 'T', 'T',
       '-', '-', '-'], dtype='<U1')
In [233]:
# how far ahead of the start does this read begin
start = r1.reference_start - reg[1] 
Out[233]:
552
In [228]:
aref = np.array(list(ref[1]))
arr1 = np.zeros(aref.size, dtype="U1")
arr2 = np.zeros(aref.size, dtype="U1")
arr1.fill("-")
In [194]:
r1.reference_start, r1.pos, r1.reference_end, r1.cigar
Out[194]:
(7745, 7745, 7821, [(0, 76)])
In [202]:
cigared(r1.seq, r1.cigar)
Out[202]:
'AATTAACATTGCTGTGCGATTCCTGGACCACTGCAGAATTCAGTGAAATCTAGGGGTCTTGGCTGCCTAAATACAT'
In [199]:
# which is forward and reverse
if r1.blocks[0][0] > r2.blocks[0][0]:
    rx = r2
    r2 = r1
    r1 = rx

# get arrs
aref = np.array(list(ref[1]))
arr1 = np.zeros(aref.size, dtype="U1")
arr2 = np.zeros(aref.size, dtype="U1")

# fill arr1
seq1 = cigared(r1.seq, r1.cigar)
arr1[]
Out[199]:
array(['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '',
       '', '', '', '', '', '', '', '', '', '', ''], dtype='<U1')
In [197]:
ref = get_ref_region(data.paramsdict["reference"], *reg)

def cigar(r1, r2, reg):
    
    # which is forward and reverse
    if r1.blocks[0][0] > r2.blocks[0][0]:
        rx = r2
        r2 = r1
        r1 = rx
        
    # get arrs
    aref = np.array(list(ref[1]))
    arr1 = np.zeros(aref.size, dtype="U1")
    arr2 = np.zeros(aref.size, dtype="U1")
    
    # fill arr1
    
    
    # do they overlap?
    overlap = False
    if max(r1.blocks[0]) > min(r2.blocks[0]):
        overlap = True
        #osegment = r1.blocks[0][1] - r2.blocks[0][0]
        #oseqs = r1.seq[-osegment:], r2.seq[:osegment]
        #print(oseqs)
    
    # modify for cigar
    for edit in r1.cigar:
        r1.seq
        
    
    
    before = "-" * (r1.pos - reg[1])
    if r1.is_reverse:
        read = before + revcomp(r1.seq)
    else:
        read = before + r1.seq
        
    midns = "-" * (r2.pos - r1.aend)
    read += midns
    
    if r2.is_reverse:
        read += r2.seq
    else:
        read += revcomp(r2.seq)

    after = "-" * (reg[2] - r2.aend)
    read += after
    return read
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-197-82b00e79f3db> in <module>()
----> 1 ref = get_ref_region(data.paramsdict["reference"], *reg)
      2 
      3 def cigar(r1, r2, reg):
      4 
      5     # which is forward and reverse

KeyError: 'reference'
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [350]:
def get_ref_region(reference, contig, rstart, rend):
    cmd = [
        ip.bins.samtools, 'faidx', 
        reference, 
        "{}:{}-{}".format(contig, rstart+1, rend)]
    stdout, err = sps.Popen(cmd, stdout=sps.PIPE).communicate()
    name, seq = stdout.decode().split("\n", 1)
    listseq = [name, seq.replace("\n", "")]
    return listseq
In [250]:
ireg = samfile.fetch(*reg)
rdict = {}fo
for read in ireg:
    if read.qname not in rdict:
        rdict[read.qname] = [read]
    else:
        rdict[read.qname].append(read)

clust = dict_to_clust(rdict)
In [251]:
clust
Out[251]:
['AATTTATTATGAAACAAAAGGCAAAAAACTGTTATGTACATAGTTTAGTCCTATTGAGTGTCTACTCAGCGCTnnnnCGCTCACTGCTCCGCAGTTCAGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATG',
 'AATTTAGACCTATTCGGTGTCTACTCAGCGCTTCTTGGCTTGTCTTTTGTATTTATTTAAnnnnGCTCCGCAGTTCGGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATG']
In [252]:
ref = get_ref_region(*reg)
ref
Out[252]:
['>Contig0:24392-24679',
 'AATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCTTCTTGGCTTGTCTCTTGTATTCATTAAATGGAGCATCTCTTGTCACTGTCCAGCAATAGTCTGCAAGCATTGATGGGCTCCATTTGCCCTGATAGCGTTTCTCCATTGTTGCAATGTCCTGGTGAAATCGCTCGCCGTGCTCGTCGCTCACTGCTCCGCAGTTCGGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATG']
In [255]:
print(ref[1][:80])
print(r1.seq)
AATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCTTCTTGGC
AATTTAGACCTATTCGGTGTCTACTCAGCGCTTCTTGGCTTGTCTTTTGTATTTATTTAA
In [272]:
# how for ahead of reference is r1 start
ahead = -1 * (reg[1] - (r1.pos - 1))
In [274]:
print(ref[1][:90])
print("-"*ahead, r1.seq)
AATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCTTCTTGGCTTGTCTCTTG
---------------------------------------- AATTTAGACCTATTCGGTGTCTACTCAGCGCTTCTTGGCTTGTCTTTTGTATTTATTTAA
In [344]:
print('r', ref[1][:90])

for key in rdict:
    r1, r2 = rdict[key]
    
    ahead = -1 * (reg[1] - (r1.pos))
    print('h', ("-"*ahead + r1.seq)[:90])
    #print(r1.cigar)
    
print('r', ref[1][-90:])

for key in rdict:
    r1, r2 = rdict[key]
    
    ahead = (reg[2] - (r2.pos))
    
    print(reg[2], r2.pos, ahead)
    print('h', ("-"*ahead))
    #print(r1.cigar)

    #print(r2.seq)
    #print(r2.get_blocks())
    #print(r1.seq)
    #print(r1.get_blocks()[0])
    #print(r1.aend, r2.get_blocks()[0][0])
    #print(r1.cigar)
    print(r2.cigarstring)
r AATTTATTATGAAACAAAAGGCAAAAAACTATTATGTACATAGTTTAGTCCTATTCAGTGTCTACTCAGCGCTTCTTGGCTTGTCTCTTG
h AATTTATTATGAAACAAAAGGCAAAAAACTGTTATGTACATAGTTTAGTCCTATTGAGTGTCTACTCAGCGCT
h -----------------------------------------AATTTAGACCTATTCGGTGTCTACTCAGCGCTTCTTGGCTTGTCTTTTG
r AATCGCTCGCCGTGCTCGTCGCTCACTGCTCCGCAGTTCGGTGGAAAAAAATCTAGATGAGAGTGCAAAAAATGTATCTTTAGTGACATG
24679 24608 71
h -----------------------------------------------------------------------
71M
24679 24616 63
h ---------------------------------------------------------------
63M
In [66]:
for read in clust:
    print(read)
AATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAACTCTACTTAAGACTTCAGGCCGCAAAGGTnnnnAGCAGTATAGGGCCTATGACACTCAGTGGTGCAGTTCTGATTCCATCCAGCAGAGGACAGTGCTGACCATG
AATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAACTCTACTTAAGACTTCAGGCCGCAGAGGTnnnnGGGCCTATGACACTCAGTGGTGCAGTTCTGATTCCATCCAGCAGAGGACAGTGCTGACCATG
AATTCAGAACTGCCAGTGAACCAACCCCCCATCCATTATTTGCACAACTCTACTTAAGACTTCAGGCCGCAAAGGnnnnAGCAGTATAGGGCCTATGACACTCAGTGGTGCAGTTCTGATTCCATCCAGCAGAGGACAGTGCTGACCATG
In [30]:
# access reads from bam file using pysam
samfile = AlignmentFile(
    os.path.join(data.dirs.refmapping, 
    "{}-mapped-sorted.bam".format(sample.name)),
    'rb')

# iterate over all mapped regions
clusters = []
for reg in regions:
    ireg = samfile.fetch(*reg)

    # match paired reads by read names for all reads in each cluster
    rdict = {}
    for read in ireg:
        if read.qname not in rdict:
            rdict[read.qname] = [read]
        else:
            rdict[read.qname].append(read)

    clust = dict_to_clust(rdict)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-30-b830d6272d8d> in <module>()
     18             rdict[read.qname].append(read)
     19 
---> 20     clust = dict_to_clust(rdict)

<ipython-input-27-17187aba4e6c> in dict_to_clust(rdict)
      2     clust = []
      3     for pair in rdict.values():
----> 4         r1, r2 = pair
      5         poss = r1.get_reference_positions() + r2.get_reference_positions()
      6         seedstart, seedend = min(poss), max(poss)

ValueError: not enough values to unpack (expected 2, got 1)
In [31]:
rdict
Out[31]:
{'1c64525c1f5f58dc1f713b0e5e1d0941;size=1': [<pysam.libcalignedsegment.AlignedSegment at 0x7f0a9fae5ac8>]}
In [29]:
from ipyrad.assemble.util import revcomp
In [106]:
def build_ref_clusters_from_cigars(data, sample):
    
    # get regions from bedtools overlaps
    regions = bedtools_merge(data, sample).strip().split("\n")
    regions = (i.split("\t") for i in regions)
    regions = ((i, int(j), int(k)) for (i, j, k) in regions)

    # access reads from bam file using pysam
    samfile = AlignmentFile(
        os.path.join(data.dirs.refmapping, 
        "{}-mapped-sorted.bam".format(sample.name)),
        'rb')
    
    # iterate over all mapped regions
    clusters = []
    for reg in regions:
        ireg = samfile.fetch(*reg)
    
        # match paired reads by read names for all reads in each cluster
        rdict = {}
        for read in ireg:
            if read.qname not in rdict:
                rdict[read.qname] = [read]
            else:
                rdict[read.qname].append(read)

        return dict_to_clust(rdict)
In [109]:
seq = build_ref_clusters_from_cigars(data, sample)
seq
Out[109]:
['TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCGCnnnnTGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATGGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG',
 'TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCGCnnnnTGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATGGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG',
 'TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCGCnnnnTGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATGGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG',
 'TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCGCnnnnTGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATGGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG',
 'TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCGCnnnnTGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATGGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG']
In [27]:
def dict_to_clust(rdict):
    clust = []
    for pair in rdict.values():
        r1, r2 = pair
        poss = r1.get_reference_positions() + r2.get_reference_positions()
        seedstart, seedend = min(poss), max(poss)

        reads_overlap = False
        #print(r1.cigartuples, r1.cigarstring, r1.cigar)

        if r1.is_reverse:
            if r2.aend > r1.get_blocks()[0][0]:
                reads_overlap = True
                seq = r2.seq + 'nnnn' + revcomp(r1.seq)
            else:
                seq = r2.seq + 'nnnn' + r2.seq

        else:
            if r1.aend > r2.get_blocks()[0][0]:
                reads_overlap = True
                seq = r1.seq + 'nnnn' + revcomp(r2.seq)
            else:
                seq = r1.seq + 'nnnn' + r2.seq
        clust.append(seq)
    return clust
In [8]:
# build clusters for aligning with muscle from the sorted bam file
samfile = AlignmentFile(
    os.path.join(data.dirs.refmapping, 
    "{}-mapped-sorted.bam".format(sample.name)),
    'rb')
In [24]:
chrom, rstart, rend = regions[0].split()
reg = samfile.fetch(chrom, int(rstart), int(rend))

for read in reg:
    print(rstart)
    print(read.seq)
5009
TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCGC
5009
TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCCC
5009
TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGAACCAGCTGCCCCC
5009
TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGCCCCGACCCAGCTGCCCGC
5009
TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCGC
5009
TGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATGGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG
5009
TGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATAGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG
5009
TGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATAGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG
5009
TGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATGGTGCGCGATCCCGCTGCAGTTAACTGGCCTCTTAACCCCG
5009
TGCACTCACTTCCATGAGCGTCTGAAAAGTGACATCTGACTCGTAGGCACGGCCGCTATGGTGCGCGATCCTGCTGCAGTTAACTGGCCTCTTAACCCCG
In [ ]:
 
In [9]:
nonmerged1 = os.path.join(
    data.tmpdir, 
    "{}_nonmerged_R1_.fastq".format(sample.name))
nonmerged2 = os.path.join(
    data.tmpdir, 
    "{}_nonmerged_R2_.fastq".format(sample.name))
edits1 = os.path.join(
    data.dirs.edits,
    "{}.trimmed_R1_.fq.gz".format(sample.name))
edits2 = os.path.join(
    data.dirs.edits, 
    "{}.trimmed_R2_.fq.gz".format(sample.name))

# file precedence
nonm1 = [i for i in (edits1, nonmerged1) if os.path.exists(i)][-1]
nonm2 = [i for i in (edits2, nonmerged2) if os.path.exists(i)][-1]
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-9-57cd2cd83b65> in <module>()
     13 
     14 # file precedence
---> 15 nonm1 = [i for i in (edits1, nonmerged1) if os.path.exists(i)][-1]
     16 nonm2 = [i for i in (edits2, nonmerged2) if os.path.exists(i)][-1]

IndexError: list index out of range
In [10]:
edits1
Out[10]:
'/home/deren/Documents/ipyrad/tests/4-refpairtest_edits/3L_0.trimmed_R1_.fq.gz'
In [11]:
infiles = [
    os.path.join(data.dirs.edits, "{}.trimmed_R1_.fastq.gz".format(sample.name)),
    os.path.join(data.dirs.edits, "{}_R1_concatedit.fq.gz".format(sample.name)),
    os.path.join(data.tmpdir, "{}_merged.fastq".format(sample.name)),
    os.path.join(data.tmpdir, "{}_declone.fastq".format(sample.name)),
]
infiles = [i for i in infiles if os.path.exists(i)]
infile = infiles[-1]
In [15]:
split_endtoend_reads(data, sample)
In [11]:
with open(inp, 'r') as infile:
    duo = izip(*[iter(infile)] * 2)
    
    idx = 0
    while 1:
        try:
            itera = next(duo)
        except StopIteration:
            break
            
        r1, r2 = itera[1].split("nnnn")
In [14]:
def split_endtoend_reads(data, sample):
    """
    Takes R1nnnnR2 derep reads from paired data and splits it back into 
    separate R1 and R2 parts for read mapping. 
    """

    inp = os.path.join(data.tmpdir, "{}_derep.fastq".format(sample.name))
    out1 = os.path.join(data.tmpdir, "{}_R1-tmp.fastq".format(sample.name))
    out2 = os.path.join(data.tmpdir, "{}_R2-tmp.fastq".format(sample.name))

    splitderep1 = open(out1, 'w')
    splitderep2 = open(out2, 'w')

    with open(inp, 'r') as infile: 
        # Read in the infile two lines at a time: (seqname, sequence)
        duo = izip(*[iter(infile)] * 2)

        ## lists for storing results until ready to write
        split1s = []
        split2s = []

        ## iterate over input splitting, saving, and writing.
        idx = 0
        while 1:
            try:
                itera = next(duo)
            except StopIteration:
                break
            ## split the duo into separate parts and inc counter
            part1, part2 = itera[1].split("nnnn")
            idx += 1

            ## R1 needs a newline, but R2 inherits it from the original file            
            ## store parts in lists until ready to write
            split1s.append("{}{}\n".format(itera[0], part1))
            split2s.append("{}{}".format(itera[0], part2))

            ## if large enough then write to file
            if not idx % 10000:
                splitderep1.write("".join(split1s))
                splitderep2.write("".join(split2s))
                split1s = []
                split2s = [] 

    ## write final chunk if there is any
    if any(split1s):
        splitderep1.write("".join(split1s))
        splitderep2.write("".join(split2s))

    ## close handles
    splitderep1.close()
    splitderep2.close()
In [10]:
merge_pairs(data, sample, 1, 1)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-15da25c95fd0> in <module>()
----> 1 mergepairs(data, sample, 1, 1)

NameError: name 'mergepairs' is not defined
In [ ]:
 
In [8]:
s3.data.samples["1A_0"].concatfiles
Out[8]:
[('/home/deren/Documents/ipyrad/tests/4-refpairtest_edits/1A_0.trimmed_R1_.fastq.gz',
  '/home/deren/Documents/ipyrad/tests/4-refpairtest_edits/1A_0.trimmed_R2_.fastq.gz')]
In [6]:
s3.run()
[####################] 100% 0:00:00 | concatenating      | s3 |
[####################] 100% 0:00:04 | mapping            | s3 |
In [28]:
def bedtools_merge(data, sample):
    """ 
    Get all contiguous genomic regions with one or more overlapping
    reads. This is the shell command we'll eventually run

    bedtools bamtobed -i 1A_0.sorted.bam | bedtools merge [-d 100]
        -i <input_bam>  :   specifies the input file to bed'ize
        -d <int>        :   For PE set max distance between reads
    """
    LOGGER.info("Entering bedtools_merge: %s", sample.name)
    mappedreads = os.path.join(data.dirs.refmapping, 
                               sample.name + "-mapped-sorted.bam")

    ## command to call `bedtools bamtobed`, and pipe output to stdout
    ## Usage:   bedtools bamtobed [OPTIONS] -i <bam> 
    ## Usage:   bedtools merge [OPTIONS] -i <bam> 
    cmd1 = [ip.bins.bedtools, "bamtobed", "-i", mappedreads]
    cmd2 = [ip.bins.bedtools, "merge", "-i", "-"]

    ## If PE the -d flag to tell bedtools how far apart to allow mate pairs.
    ## If SE the -d flag is negative, specifying that SE reads need to
    ## overlap by at least a specific number of bp. This prevents the
    ## stairstep syndrome when a + and - read are both extending from
    ## the same cutsite. Passing a negative number to `merge -d` gets this done.
    if 'pair' in data.paramsdict["datatype"]:
        check_insert_size(data, sample)
        #cmd2.insert(2, str(data._hackersonly["max_inner_mate_distance"]))
        cmd2.insert(2, str(data._hackersonly["max_inner_mate_distance"]))
        cmd2.insert(2, "-d")
    else:
        cmd2.insert(2, str(-1 * data._hackersonly["min_SE_refmap_overlap"]))
        cmd2.insert(2, "-d")

    ## pipe output from bamtobed into merge
    LOGGER.info("stdv: bedtools merge cmds: %s %s", cmd1, cmd2)
    proc1 = sps.Popen(cmd1, stderr=sps.STDOUT, stdout=sps.PIPE)
    proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc1.stdout)
    result = proc2.communicate()[0].decode()
    proc1.stdout.close()

    ## check for errors and do cleanup
    if proc2.returncode:
        raise IPyradWarningExit("error in %s: %s", cmd2, result)

    ## Write the bedfile out, because it's useful sometimes.
    if os.path.exists(ip.__debugflag__):
        with open(os.path.join(data.dirs.refmapping, sample.name + ".bed"), 'w') as outfile:
            outfile.write(result)

    ## Report the number of regions we're returning
    nregions = len(result.strip().split("\n"))
    LOGGER.info("bedtools_merge: Got # regions: %s", nregions)
    return result
In [27]:
bedtools_merge
Out[27]:
<function __main__.bedtools_merge(data, sample)>
In [29]:
def check_insert_size(data, sample):
    """
    check mean insert size for this sample and update 
    hackersonly.max_inner_mate_distance if need be. This value controls how 
    far apart mate pairs can be to still be considered for bedtools merging 
    downstream.
    """

    ## pipe stats output to grep
    cmd1 = [
        ip.bins.samtools, 
        "stats", 
        os.path.join(
            data.dirs.refmapping, "{}-mapped-sorted.bam".format(sample.name)),
        ]
    cmd2 = ["grep", "SN"]
    proc1 = sps.Popen(cmd1, stderr=sps.STDOUT, stdout=sps.PIPE)
    proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc1.stdout)
    res = proc2.communicate()[0].decode()
    if proc2.returncode:
        raise IPyradWarningExit("error in %s: %s", cmd2, res)
        
    ## starting vals
    avg_insert = 0
    stdv_insert = 0
    avg_len = 0

    ## iterate over results
    for line in res.split("\n"):
        if "insert size average" in line:
            avg_insert = float(line.split(":")[-1].strip())

        elif "insert size standard deviation" in line:
            ## hack to fix sim data when stdv is 0.0. Shouldn't
            ## impact real data bcz stdv gets rounded up below
            stdv_insert = float(line.split(":")[-1].strip()) + 0.1
       
        elif "average length" in line:
            avg_len = float(line.split(":")[-1].strip())

    LOGGER.debug("avg {} stdv {} avg_len {}"
                 .format(avg_insert, stdv_insert, avg_len))

    ## If all values return successfully set the max inner mate distance.
    ## This is tricky. avg_insert is the average length of R1+R2+inner mate
    ## distance. avg_len is the average length of a read. If there are lots
    ## of reads that overlap then avg_insert will be close to but bigger than
    ## avg_len. We are looking for the right value for `bedtools merge -d`
    ## which wants to know the max distance between reads. 
    if all([avg_insert, stdv_insert, avg_len]):
        ## If 2 * the average length of a read is less than the average
        ## insert size then most reads DO NOT overlap
        if stdv_insert < 5:
            stdv_insert = 5.
        if (2 * avg_len) < avg_insert:
            hack = avg_insert + (3 * np.math.ceil(stdv_insert)) - (2 * avg_len)

        ## If it is > than the average insert size then most reads DO
        ## overlap, so we have to calculate inner mate distance a little 
        ## differently.
        else:
            hack = (avg_insert - avg_len) + (3 * np.math.ceil(stdv_insert))
            

        ## set the hackerdict value
        LOGGER.info("stdv: hacked insert size is %s", hack)
        data._hackersonly["max_inner_mate_distance"] = int(np.math.ceil(hack))

    else:
        ## If something fsck then set a relatively conservative distance
        data._hackersonly["max_inner_mate_distance"] = 300
        LOGGER.debug("inner mate distance for {} - {}".format(sample.name,\
                    data._hackersonly["max_inner_mate_distance"]))
In [33]:
#from ipyrad.assemble.refmap import *
data = s3.data
samples = list(data.samples.values())
sample = samples[0]
regions = bedtools_merge(data, sample).strip().split("\n")
In [34]:
nregions = len(regions)
chunksize = (nregions / 10) + (nregions % 10)
In [43]:
from pysam import AlignmentFile
sample.files.mapped_reads = os.path.join(
    data.dirs.refmapping, sample.name + "-mapped-sorted.bam")
samfile = AlignmentFile(sample.files.mapped_reads, 'rb')
#"./tortas_refmapping/PZ70-mapped-sorted.bam", "rb")
In [97]:
regions[1]
Out[97]:
'MT\t10109\t10200'
In [170]:
samfile = pysam.AlignmentFile("ex1.sam", "r")
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-170-d009e749899f> in <module>()
----> 1 samfile = pysam.AlignmentFile("ex1.sam", "r")

pysam/libcalignmentfile.pyx in pysam.libcalignmentfile.AlignmentFile.__cinit__()

pysam/libcalignmentfile.pyx in pysam.libcalignmentfile.AlignmentFile._open()

FileNotFoundError: [Errno 2] could not open alignment file `ex1.sam`: No such file or directory
In [262]:
a = pysam.AlignmentFile("./3-refsetest_refmapping/1A_0.sam")
In [287]:
nthreads = 2
cmd1 = [
    ip.bins.bwa, "mem",
    "-t", str(max(1, nthreads)),
    "-M",
    data.paramsdict['reference_sequence'], 
    sample.concatfiles[0][0],
    sample.concatfiles[0][1] or "",
    ] 
cmd1 = [i for i in cmd1 if i]

# Insert optional flags for bwa
bwa_args = data._hackersonly["bwa_args"].split()
bwa_args.reverse()
for arg in bwa_args:
    cmd1.insert(2, arg)

with open(sample.files.sam, 'wb') as outfile:
    proc1 = sps.Popen(cmd1, stderr=None, stdout=outfile)
    error1 = proc1.communicate()[0]
    if proc1.returncode:
        raise IPyradError(error1)
In [288]:
def bwa_map(data, sample, nthreads, force):
    """ 
    Map reads to reference sequence. This reads in the fasta files
    (samples.files.edits), and maps each read to the reference. Unmapped reads 
    are dropped right back in the de novo pipeline.  
    Mapped reads end up in a sam file.
    """

    sample.files.sam = os.path.join(
        data.dirs.refmapping, 
        "{}.sam".format(sample.name))

    sample.files.mapped_reads = os.path.join(
        data.dirs.refmapping,
        "{}-mapped-sorted.bam".format(sample.name))

    sample.files.unmapped_bam = os.path.join(
        data.dirs.refmapping,
        "{}-unmapped.bam".format(sample.name))

    sample.files.unmapped_reads = os.path.join(
        data.dirs.refmapping,
        "{}-unmapped.fastq".format(sample.name))


    # (cmd1) bwa mem [OPTIONS] <index_name> <file_name_A> [<file_name_B>] 
    #  -t #         : Number of threads
    #  -M           : Mark split alignments as secondary.

    # (cmd2) samtools view [options] <in.bam>|<in.sam>|<in.cram> [region ...] 
    #  -b = write to .bam
    #  -q = Only keep reads with mapq score >= 30 (seems to be pretty standard)
    #  -F = Select all reads that DON'T have these flags. 
    #        0x4 (segment unmapped)
    #        0x100 (Secondary alignment)
    #        0x800 (supplementary alignment)
    #  -U = Write out all reads that don't pass the -F filter 
    #       (all unmapped reads go to this file).

    # (cmd3) samtools sort [options...] [in.bam]
    #  -T = Temporary file name, this is required by samtools, ignore it
    #       Here we hack it to be samhandle.tmp cuz samtools cleans it up
    #  -O = Output file format, in this case bam
    #  -o = Output file name

    # (cmd5) samtools bam2fq -v 45 [in.bam]
    #   -v45 set the default qscore arbirtrarily high
    #
    cmd1 = [
        ip.bins.bwa, "mem",
        "-t", str(max(1, nthreads)),
        "-M",
        data.paramsdict['reference_sequence'], 
        sample.concatfiles[0][0],
        sample.concatfiles[0][1] or "",
        ] 
    cmd1 = [i for i in cmd1 if i]

    # Insert optional flags for bwa
    bwa_args = data._hackersonly["bwa_args"].split()
    bwa_args.reverse()
    for arg in bwa_args:
        cmd1.insert(2, arg)

    with open(sample.files.sam, 'wb') as outfile:
        proc1 = sps.Popen(cmd1, stderr=None, stdout=outfile)
        error1 = proc1.communicate()[0]
        if proc1.returncode:
            raise IPyradError(error1)

    # sends unmapped reads to a files and will PIPE mapped reads to cmd3
    cmd2 = [
        ip.bins.samtools, "view", 
        "-b", 
        "-F", "0x904",
        "-U", sample.files.unmapped_bam,
        sample.files.sam, 
        ]

    # this is gonna catch mapped bam output from cmd2 and write to file
    cmd3 = [
        ip.bins.samtools, "sort", 
        "-T", os.path.join(data.dirs.refmapping, sample.name + ".sam.tmp"),
        "-O", "bam", 
        "-o", sample.files.mapped_reads]

    # Later we're gonna use samtools to grab out regions using 'view'
    cmd4 = [ipyrad.bins.samtools, "index", sample.files.mapped_reads]

    # convert unmapped reads to fastq
    cmd5 = [
        ip.bins.samtools, "bam2fq",
        "-v 45",
        sample.files.unmapped_bam, 
        ]

    # Insert additional arguments for paired data to the commands.
    # We assume Illumina paired end reads for the orientation 
    # of mate pairs (orientation: ---> <----). 
    if 'pair' in data.paramsdict["datatype"]:
        # add samtools filter for only keep if both pairs hit
        # 0x1 - Read is paired
        # 0x2 - Each read properly aligned
        cmd2.insert(2, "0x3")
        cmd2.insert(2, "-f")

        # tell bam2fq that there are output files for each read pair
        cmd5.insert(2, os.path.join(
            data.dirs.edits, sample.name + "-tmp-umap1.fastq"))
        cmd5.insert(2, "-1")
        cmd5.insert(2, os.path.join(
            data.dirs.edits, sample.name + "-tmp-umap2.fastq"))
        cmd5.insert(2, "-2")
    else:
        cmd5.insert(2, sample.files.unmapped_reads)
        cmd5.insert(2, "-0")


    # cmd2 writes to sname.unmapped.bam and fills pipe with mapped BAM data
    LOGGER.debug(" ".join(cmd2))
    proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE)

    # cmd3 pulls mapped BAM from pipe and writes to sname.mapped-sorted.bam
    LOGGER.debug(" ".join(cmd3))
    proc3 = sps.Popen(cmd3, 
        stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc2.stdout)
    error3 = proc3.communicate()[0]
    if proc3.returncode:
        raise IPyradWarningExit(error3)
    proc2.stdout.close()

    # cmd4 indexes the bam file 
    LOGGER.debug(" ".join(cmd4))
    proc4 = sps.Popen(cmd4, stderr=sps.STDOUT, stdout=sps.PIPE)
    error4 = proc4.communicate()[0]
    if proc4.returncode:
        raise IPyradWarningExit(error4)
    
    # Running cmd5 writes to either edits/sname-refmap_derep.fastq for SE
    # or it makes edits/sname-tmp-umap{12}.fastq for paired data, which 
    # will then need to be merged.
    LOGGER.debug(" ".join(cmd5))
    proc5 = sps.Popen(cmd5, stderr=sps.STDOUT, stdout=sps.PIPE)
    error5 = proc5.communicate()[0]
    if proc5.returncode:
        raise IPyradWarningExit(error5)
In [289]:
bwa_map(data, sample, 2, 1)
In [ ]:

In [205]:
cmd1 = [
    ip.bins.bwa, "mem",
    "-t", str(max(1, 2)),
    "-M",
    data.paramsdict['reference_sequence'], 
    sample.concatfiles[0][0],
    sample.concatfiles[0][1] or "",
    ] 

proc1 = sps.Popen(cmd1)#, stderr=cmd1_stderr, stdout=cmd1_stdout)
proc1.communicate()
Out[205]:
(None, None)
In [ ]:
pysam.sort("-m", "1000000", "-o", "output.bam", "ex1.bam")
In [182]:
iterreg = samfile.fetch("MT", 5009, 5100)
iterreg = samfile.fetch("MT", 10109, 10200)
iterreg = samfile.fetch("MT", 285510, 285600)
In [181]:
for read in iterreg:
    print(read.seq)
    print(read.compare(read.get_reference_sequence()))
TGCAGACCATAATGGCATGGTGCCACAGCTAGGGACCGATCTCTTTCTAAACTACATCCCCTAGGTTGAGACCCATGCGTGCTCTAATTGG
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-181-b00faa645bf8> in <module>()
      1 for read in iterreg:
      2     print(read.seq)
----> 3     print(read.compare(read.get_reference_sequence()))

TypeError: Argument 'other' has incorrect type (expected pysam.libcalignedsegment.AlignedSegment, got str)
In [104]:
it = samfile.pileup("MT", 5009, 5100)
pysam.Pileup.
Out[104]:
<pysam.libcalignmentfile.IteratorColumnRegion at 0x7fc1fa046558>
In [102]:
for read in iterreg:
    print(read.aligned_pairs)
[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]
[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]
[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]
[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]
[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]
[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]
[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]
[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]
[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]
[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]
[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]
[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]
[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]
[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]
[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]
[(0, 10109), (1, 10110), (2, 10111), (3, 10112), (4, 10113), (5, 10114), (6, 10115), (7, 10116), (8, 10117), (9, 10118), (10, 10119), (11, 10120), (12, 10121), (13, 10122), (14, 10123), (15, 10124), (16, 10125), (17, 10126), (18, 10127), (19, 10128), (20, 10129), (21, 10130), (22, 10131), (23, 10132), (24, 10133), (25, 10134), (26, 10135), (27, 10136), (28, 10137), (29, 10138), (30, 10139), (31, 10140), (32, 10141), (33, 10142), (34, 10143), (35, 10144), (36, 10145), (37, 10146), (38, 10147), (39, 10148), (40, 10149), (41, 10150), (42, 10151), (43, 10152), (44, 10153), (45, 10154), (46, 10155), (47, 10156), (48, 10157), (49, 10158), (50, 10159), (51, 10160), (52, 10161), (53, 10162), (54, 10163), (55, 10164), (56, 10165), (57, 10166), (58, 10167), (59, 10168), (60, 10169), (61, 10170), (62, 10171), (63, 10172), (64, 10173), (65, 10174), (66, 10175), (67, 10176), (68, 10177), (69, 10178), (70, 10179), (71, 10180), (72, 10181), (73, 10182), (74, 10183), (75, 10184), (76, 10185), (77, 10186), (78, 10187), (79, 10188), (80, 10189), (81, 10190), (82, 10191), (83, 10192), (84, 10193), (85, 10194), (86, 10195), (87, 10196), (88, 10197), (89, 10198), (90, 10199)]
In [87]:
rdict = {}
for read in iterreg:
    if read.qname not in rdict:
        rdict[read.qname] = read
In [80]:
read.seq
Out[80]:
'TGCAGTTTAACTGTTCAAGTTGGCAAGATCAAGTCGTCCCTAGCCCCCGCGTCCGTTTTTACCTGGTCGCGGTCCCGACCCAGCTGCCCCC'
In [84]:
read.cigar
Out[84]:
[(0, 91)]
In [77]:
sorted(
    rdict.keys(),
    key=lambda x: int(x.split(""))
Out[77]:
dict_keys(['lane1_locus0_1B_0_0', 'lane1_locus0_1B_0_1', 'lane1_locus0_1B_0_2', 'lane1_locus0_1B_0_3', 'lane1_locus0_1B_0_4', 'lane1_locus0_1B_0_5', 'lane1_locus0_1B_0_6', 'lane1_locus0_1B_0_7', 'lane1_locus0_1B_0_8', 'lane1_locus0_1B_0_9', 'lane1_locus0_1B_0_10', 'lane1_locus0_1B_0_11', 'lane1_locus0_1B_0_12', 'lane1_locus0_1B_0_13', 'lane1_locus0_1B_0_14', 'lane1_locus0_1B_0_15', 'lane1_locus0_1B_0_16', 'lane1_locus0_1B_0_17', 'lane1_locus0_1B_0_18', 'lane1_locus0_1B_0_19', 'lane1_locus0_1B_0_20'])
In [65]:
it = samfile.pileup('MT', 5009, 5100)
In [71]:
clusts = []
nclusts = 0
for region in regions:
    chrom, pos1, pos2 = region.split()
    
    clust = fetch_cluster_se(data, samfile, chrom, int(pos1), int(pos2))
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-71-06a117ea708d> in <module>()
      4     chrom, pos1, pos2 = region.split()
      5 
----> 6     clust = fetch_cluster_se(data, samfile, chrom, int(pos1), int(pos2))

~/Documents/ipyrad/ipyrad/assemble/refmap.py in fetch_cluster_se(data, samfile, chrom, rstart, rend)
    368     ## sort dict keys so highest derep is first ('seed')
    369     sfunc = lambda x: int(x.split(";size=")[1].split(";")[0])
--> 370     rkeys = sorted(rdict.keys(), key=sfunc, reverse=True)
    371 
    372     ## get blocks from the seed for filtering, bail out if seed is not paired

~/Documents/ipyrad/ipyrad/assemble/refmap.py in <lambda>(x)
    367 
    368     ## sort dict keys so highest derep is first ('seed')
--> 369     sfunc = lambda x: int(x.split(";size=")[1].split(";")[0])
    370     rkeys = sorted(rdict.keys(), key=sfunc, reverse=True)
    371 

IndexError: list index out of range
In [ ]:
def fetch_cluster_se(data, samfile, chrom, rstart, rend):
    """
    Builds a single end cluster from the refmapped data.
    """

    ## If SE then we enforce the minimum overlap distance to avoid the
    ## staircase syndrome of multiple reads overlapping just a little.
    overlap_buffer = data._hackersonly["min_SE_refmap_overlap"]

    ## the *_buff variables here are because we have to play patty
    ## cake here with the rstart/rend vals because we want pysam to
    ## enforce the buffer for SE, but we want the reference sequence
    ## start and end positions to print correctly for downstream.
    rstart_buff = rstart + overlap_buffer
    rend_buff = rend - overlap_buffer

    ## Reads that map to only very short segements of the reference
    ## sequence will return buffer end values that are before the
    ## start values causing pysam to complain. Very short mappings.
    if rstart_buff > rend_buff:
        tmp = rstart_buff
        rstart_buff = rend_buff
        rend_buff = tmp
    ## Buffering can't make start and end equal or pysam returns nothing.
    if rstart_buff == rend_buff:
        rend_buff += 1

    ## store pairs
    rdict = {}
    clust = []
    iterreg = []

    iterreg = samfile.fetch(chrom, rstart_buff, rend_buff)

    ## use dict to match up read pairs
    for read in iterreg:
        if read.qname not in rdict:
            rdict[read.qname] = read

    ## sort dict keys so highest derep is first ('seed')
    sfunc = lambda x: int(x.split(";size=")[1].split(";")[0])
    rkeys = sorted(rdict.keys(), key=sfunc, reverse=True)

    ## get blocks from the seed for filtering, bail out if seed is not paired
    try:
        read1 = rdict[rkeys[0]]
    except ValueError:
        LOGGER.error("Found bad cluster, skipping - key:{} rdict:{}".format(rkeys[0], rdict))
        return ""

    ## the starting blocks for the seed
    poss = read1.get_reference_positions(full_length=True)
    seed_r1start = min(poss)
    seed_r1end = max(poss)

    ## store the seed -------------------------------------------
    if read1.is_reverse:
        seq = revcomp(read1.seq)
    else:
        seq = read1.seq

    ## store, could write orient but just + for now.
    size = sfunc(rkeys[0])
    clust.append(">{}:{}:{};size={};*\n{}"\
        .format(chrom, seed_r1start, seed_r1end, size, seq))

    ## If there's only one hit in this region then rkeys will only have
    ## one element and the call to `rkeys[1:]` will raise. Test for this.
    if len(rkeys) > 1:
        ## store the hits to the seed -------------------------------
        for key in rkeys[1:]:
            skip = False
            try:
                read1 = rdict[key]
            except ValueError:
                ## enter values that will make this read get skipped
                read1 = rdict[key][0]
                skip = True

            ## orient reads only if not skipping
            if not skip:
                poss = read1.get_reference_positions(full_length=True)
                minpos = min(poss)
                maxpos = max(poss)
                ## store the seq
                if read1.is_reverse:
                    seq = revcomp(read1.seq)
                else:
                    seq = read1.seq
                ## store, could write orient but just + for now.
                size = sfunc(key)
                clust.append(">{}:{}:{};size={};+\n{}"\
                    .format(chrom, minpos, maxpos, size, seq))
            else:
                ## seq is excluded, though, we could save it and return
                ## it as a separate cluster that will be aligned separately.
                pass

    return clust    
In [62]:
cmd1 = [
    ip.bins.bwa, "mem",
    "-t", str(max(1, nthreads)),
    "-M",
    data.paramsdict['reference_sequence'], 
    ] 
cmd1 += [i for i in sample.concatfiles[0] if i]

# Insert optional flags for bwa
bwa_args = data._hackersonly["bwa_args"].split()
bwa_args.reverse()
for arg in bwa_args:
    cmd1.insert(2, arg)

cmd1_stdout_handle = os.path.join(
    data.dirs.refmapping, sample.name + ".sam")
cmd1_stdout = open(cmd1_stdout_handle, 'w')
cmd1_stderr = None

proc1 = sps.Popen(cmd1, stderr=cmd1_stderr, stdout=cmd1_stdout)
In [63]:
error1 = proc1.communicate()[0]
In [65]:
cmd2 = [
    ip.bins.samtools, "view", 
    "-b", 
    #"-q", "30",
    "-F", "0x904",
    "-U", os.path.join(
        data.dirs.refmapping, sample.name + "-unmapped.bam"), 
    os.path.join(data.dirs.refmapping, sample.name + ".sam")]
In [69]:
proc2 = sps.Popen(cmd2, stderr=sps.STDOUT, stdout=sps.PIPE)
#proc2.communicate()
In [74]:
proc3 = sps.Popen(cmd3, stderr=sps.STDOUT, stdout=sps.PIPE, stdin=proc2.stdout)
proc3.communicate()
Out[74]:
(b'', None)
In [77]:
proc4 = sps.Popen(cmd4, stderr=sps.STDOUT, stdout=sps.PIPE)
proc4.communicate()
Out[77]:
(b'', None)
In [81]:
    sample.files.unmapped_reads = os.path.join(
        data.dirs.refmapping,
        "{}-unmapped.fastq".format(sample.name))

    
    cmd4 = [ip.bins.samtools, "index", sample.files.mapped_reads]

    # this is gonna read in the unmapped files, args are added below, 
    # and it will output fastq formatted unmapped reads for merging.
    # -v 45 sets the default qscore arbitrarily high
    cmd5 = [
        ip.bins.samtools, "bam2fq",
        "-v 45",
        os.path.join(data.dirs.refmapping, sample.name + "-unmapped.bam")]

    # Insert additional arguments for paired data to the commands.
    # We assume Illumina paired end reads for the orientation 
    # of mate pairs (orientation: ---> <----). 
    if 'pair' in data.paramsdict["datatype"]:
        # add samtools filter for only keep if both pairs hit
        # 0x1 - Read is paired
        # 0x2 - Each read properly aligned
        cmd2.insert(2, "0x3")
        cmd2.insert(2, "-f")

        # tell bam2fq that there are output files for each read pair
        cmd5.insert(2, os.path.join(data.dirs.edits, sample.name + "-tmp-umap1.fastq"))
        cmd5.insert(2, "-1")
        cmd5.insert(2, os.path.join(data.dirs.edits, sample.name + "-tmp-umap2.fastq"))
        cmd5.insert(2, "-2")
    else:
        cmd5.insert(2, sample.files.unmapped_reads)
        cmd5.insert(2, "-0")
In [82]:
cmd5
Out[82]:
['/home/deren/Documents/ipyrad/bin/samtools-linux-x86_64',
 'bam2fq',
 '-0',
 '/home/deren/Documents/ipyrad/tests/3-refsetest_refmapping/3K_0-unmapped.fastq',
 '-v 45',
 '/home/deren/Documents/ipyrad/tests/3-refsetest_refmapping/3K_0-unmapped.bam']
In [83]:
proc5 = sps.Popen(cmd5, stderr=sps.STDOUT, stdout=sps.PIPE)
error5 = proc5.communicate()[0]
In [73]:
sample.files.mapped_reads = os.path.join(
    data.dirs.refmapping,
    "{}-mapped-sorted.bam".format(sample.name))

# this is gonna catch mapped bam output from cmd2 and write to file
cmd3 = [
    ip.bins.samtools, "sort", 
    "-T", os.path.join(data.dirs.refmapping, sample.name + ".sam.tmp"),
    "-O", "bam", 
    "-o", sample.files.mapped_reads]
cmd3
Out[73]:
['/home/deren/Documents/ipyrad/bin/samtools-linux-x86_64',
 'sort',
 '-T',
 '/home/deren/Documents/ipyrad/tests/3-refsetest_refmapping/3K_0.sam.tmp',
 '-O',
 'bam',
 '-o',
 '/home/deren/Documents/ipyrad/tests/3-refsetest_refmapping/3K_0-mapped-sorted.bam']
In [53]:
data.dirs.refmapping
Out[53]:
''
In [17]:
nthreads = 2


cmd1 = [
    ip.bins.bwa, "mem",
    "-t", str(max(1, nthreads)),
    "-M",
    data.paramsdict['reference_sequence']
    ] 

cmd1
#cmd1 += sample.files.dereps
Out[17]:
['/home/deren/Documents/ipyrad/bin/bwa-linux-x86_64',
 'mem',
 '-t',
 '2',
 '-M',
 '/home/deren/Documents/ipyrad/tests/ipsimdata/rad_example_genome.fa']
In [ ]:
 
In [4]:
s3 = Step3(data, list(data.samples.values()), 0, 5, True, ipyclient)
sample = s3.samples[0]
s3.run()
[####################] 100% 0:00:01 | dereplicating      | s3 |
[####################] 100% 0:00:01 | clustering/mapping | s3 |
[####################] 100% 0:00:00 | building clusters  | s3 |
[####################] 100% 0:00:00 | chunking clusters  | s3 |
[####################] 100% 0:00:15 | aligning clusters  | s3 |
[####################] 100% 0:00:00 | concat clusters    | s3 |
In [7]:
#derep_sort_map(s3.data, sample, s3.force, s3.nthreads)
sample.concatfiles = concat_multiple_edits(s3.data, sample)
In [13]:
sample.mergedfile = merge_pairs(s3.data, sample, 1, 1)  
sample.mergedfile
Out[13]:
[('/home/deren/Documents/ipyrad/tests/2-setest_edits/3K_0.trimmed_R1_.fastq.gz',
  0)]
In [12]:
new_derep_and_sort(s3.data, sample.mergedfile, os.path.join(s3.data.tmpdir, sample.name+ "_derep.fastq"), s3.nthreads)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-12-300134f582b0> in <module>()
----> 1 new_derep_and_sort(s3.data, sample.mergedfile, os.path.join(s3.data.tmpdir, sample.name+ "_derep.fastq"), s3.nthreads)

~/Documents/ipyrad/ipyrad/assemble/clustmap.py in new_derep_and_sort(data, infile, outfile, nthreads)
    427 
    428     ## build PIPEd job
--> 429     proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE, close_fds=True)
    430     errmsg = proc.communicate()[0]
    431     if proc.returncode:

~/miniconda3/lib/python3.6/subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors)
    707                                 c2pread, c2pwrite,
    708                                 errread, errwrite,
--> 709                                 restore_signals, start_new_session)
    710         except:
    711             # Cleanup if the child failed starting.

~/miniconda3/lib/python3.6/subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)
   1273                             errread, errwrite,
   1274                             errpipe_read, errpipe_write,
-> 1275                             restore_signals, start_new_session, preexec_fn)
   1276                     self._child_created = True
   1277                 finally:

TypeError: expected str, bytes or os.PathLike object, not list
In [ ]:
strand = "plus"
if s3.data.paramsdict["datatype"] is ('gbs' or '2brad'):
    strand = "both"


cmd = [
    ip.bins.vsearch,
    "--derep_fulllength", sample.mergedfile,
    "--strand", strand,
    "--output", outfile,
    "--threads", str(nthreads),
    "--fasta_width", str(0),
    "--fastq_qmax", "1000",
    "--sizeout", 
    "--relabel_md5",
    ]
In [14]:
s3.data.run("4")
Assembly: pairtest
[####################] 100% 0:00:00 | inferring [H, E]     | s4 |

  Encountered an unexpected error (see ./ipyrad_log.txt)
  Error message is below -------------------------------
The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
In [14]:
s3.remote_run_dereps()
#s3.remote_run_cluster_map_build()
[####################] 100% 0:00:01 | dereplicating      | s3 |
In [15]:
for sample in s3.samples:
    cluster(s3.data, sample, s3.nthreads, s3.force)
In [8]:
for sample in s3.samples:
    build_clusters(s3.data, sample, s3.maxindels)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-8-dad8bccabeaa> in <module>()
      1 for sample in s3.samples:
----> 2     build_clusters(s3.data, sample, s3.maxindels)

~/Documents/ipyrad/ipyrad/assemble/clustmap.py in build_clusters(data, sample, maxindels)
    872                         key=lambda x:
    873                             int(x.split(";size=")[-1].split("\n")[:-1]),
--> 874                         reverse=True)                    
    875                     seqlist.append("\n".join(fseqs))
    876                     seqsize += 1

~/Documents/ipyrad/ipyrad/assemble/clustmap.py in <lambda>(x)
    871                     fseqs = [fseqs[0]] + sorted(fseqs[1:], 
    872                         key=lambda x:
--> 873                             int(x.split(";size=")[-1].split("\n")[:-1]),
    874                         reverse=True)                    
    875                     seqlist.append("\n".join(fseqs))

TypeError: int() argument must be a string, a bytes-like object or a number, not 'list'
In [26]:
for sample in s3.samples:
    muscle_chunker(s3.data, sample)
In [33]:
aasyncs = {}
for sample in s3.samples:
    aasyncs[sample.name] = []
    for idx in range(10):
        handle = os.path.join(s3.data.tmpdir, 
            "{}_chunk_{}.ali".format(sample.name, idx))
        align_and_parse(handle, s3.maxindels, s3.gbs)
In [32]:
def _get_derep_num(read):
    "return the number of replicates in a derep read"
    return int(read.split("=")[-1].split("\n")[0][:-1])




def _aligned_indel_filter(clust, max_internal_indels):
    """ checks for too many internal indels in muscle aligned clusters """

    ## make into list
    lclust = clust.split()
    
    ## paired or not
    try:
        seq1 = [i.split("nnnn")[0] for i in lclust[1::2]]
        seq2 = [i.split("nnnn")[1] for i in lclust[1::2]]
        intindels1 = [i.rstrip("-").lstrip("-").count("-") for i in seq1]
        intindels2 = [i.rstrip("-").lstrip("-").count("-") for i in seq2]
        intindels = intindels1 + intindels2
        if max(intindels) > max_internal_indels:
            return 1
    except IndexError:
        seq1 = lclust[1::2]
        intindels = [i.rstrip("-").lstrip("-").count("-") for i in seq1]
        if max(intindels) > max_internal_indels:
            return 1     
    return 0
In [34]:
for sample in s3.samples:
    reconcat(s3.data, sample)
In [27]:
def align_and_parse(handle, max_internal_indels=5, is_gbs=False):
    """ much faster implementation for aligning chunks """

    ## data are already chunked, read in the whole thing. bail if no data.
    try:
        with open(handle, 'rb') as infile:
            clusts = infile.read().decode().split("//\n//\n")
            # remove any empty spots
            clusts = [i for i in clusts if i]
            # Skip entirely empty chunks
            if not clusts:
                raise IPyradError("no clusters in file: {}".format(handle))

    except (IOError, IPyradError):
        LOGGER.debug("skipping empty chunk - {}".format(handle))
        return 0

    ## count discarded clusters for printing to stats later
    highindels = 0

    ## iterate over clusters sending each to muscle, splits and aligns pairs
    aligned = _persistent_popen_align3(clusts, 200, is_gbs)

    ## store good alignments to be written to file
    refined = []

    ## filter and trim alignments
    for clust in aligned:
        # check for too many internal indels
        if not _aligned_indel_filter(clust, max_internal_indels):
            refined.append(clust)
        else:
            highindels += 1

    ## write to file after
    if refined:
        outhandle = handle.rsplit(".", 1)[0] + ".aligned"
        with open(outhandle, 'wb') as outfile:
            outfile.write(str.encode("\n//\n//\n".join(refined) + "\n"))

    ## remove the old tmp file
    if not LOGGER.getEffectiveLevel() == 10:
        os.remove(handle)
    return highindels
In [28]:
def _persistent_popen_align3(clusts, maxseqs=200, is_gbs=False):
    """ keeps a persistent bash shell open and feeds it muscle alignments """

    ## create a separate shell for running muscle in, this is much faster
    ## than spawning a separate subprocess for each muscle call
    proc = sps.Popen(
        ["bash"], 
        stdin=sps.PIPE, 
        stdout=sps.PIPE, 
        bufsize=0,
        )

    ## iterate over clusters in this file until finished
    aligned = []
    for clust in clusts:

        ## new alignment string for read1s and read2s
        align1 = []
        align2 = []

        ## don't bother aligning if only one seq
        if clust.count(">") == 1:
            aligned.append(clust.replace(">", "").strip())
        else:

            # do we need to split the alignment? (is there a PE insert?)
            try:
                # make into list (only read maxseqs lines, 2X cuz names)
                lclust = clust.split()[:maxseqs * 2]

                # try to split cluster list at nnnn separator for each read
                lclust1 = list(chain(*zip(
                    lclust[::2], [i.split("nnnn")[0] for i in lclust[1::2]])))
                lclust2 = list(chain(*zip(
                    lclust[::2], [i.split("nnnn")[1] for i in lclust[1::2]])))

                # put back into strings
                clust1 = "\n".join(lclust1)
                clust2 = "\n".join(lclust2)

                # Align the first reads.
                # The muscle command with alignment as stdin and // as split
                cmd1 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                        .format(clust1, ip.bins.muscle, "//\n"))

                # send cmd1 to the bash shell
                proc.stdin.write(cmd1.encode())

                # read the stdout by line until splitter is reached
                # meaning that the alignment is finished.
                for line in iter(proc.stdout.readline, b'//\n'):
                    align1.append(line.decode())

                # Align the second reads.
                # The muscle command with alignment as stdin and // as split
                cmd2 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                        .format(clust2, ip.bins.muscle, "//\n"))

                # send cmd2 to the bash shell
                proc.stdin.write(cmd2.encode())

                # read the stdout by line until splitter is reached
                # meaning that the alignment is finished.
                for line in iter(proc.stdout.readline, b'//\n'):
                    align2.append(line.decode())

                # join up aligned read1 and read2 and ensure names order match
                lines1 = "".join(align1)[1:].split("\n>")
                lines2 = "".join(align2)[1:].split("\n>")
                dalign1 = dict([i.split("\n", 1) for i in lines1])
                dalign2 = dict([i.split("\n", 1) for i in lines2])

                # sort the first reads
                keys = list(dalign1.keys())
                seed = [i for i in keys if i[-1] == "*"][0]
                keys.pop(keys.index(seed))
                order = [seed] + sorted(
                    keys, key=_get_derep_num, reverse=True)                

                # combine in order
                alignpe = []                
                for key in order:
                    alignpe.append("\n".join([
                        key, 
                        dalign1[key].replace("\n", "") + "nnnn" + \
                        dalign2[key].replace("\n", "")]))

                ## append aligned cluster string
                aligned.append("\n".join(alignpe).strip())

            # Malformed clust. Dictionary creation with only 1 element 
            except ValueError as inst:
                ip.logger.debug(
                    "Bad PE cluster - {}\nla1 - {}\nla2 - {}"
                    .format(clust, lines1, lines2))

            ## Either reads are SE, or at least some pairs are merged.
            except IndexError:
                    
                # limit the number of input seqs
                # use lclust already built before checking pairs
                lclust = "\n".join(clust.split()[:maxseqs * 2])

                # the muscle command with alignment as stdin and // as splitter
                cmd = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                       .format(lclust, ip.bins.muscle, "//\n"))

                ## send cmd to the bash shell (TODO: PIPE could overflow here!)
                proc.stdin.write(cmd.encode())

                ## read the stdout by line until // is reached. This BLOCKS.
                for line in iter(proc.stdout.readline, b'//\n'):
                    align1.append(line.decode())

                ## remove '>' from names, and '\n' from inside long seqs                
                lines = "".join(align1)[1:].split("\n>")

                ## find seed of the cluster and put it on top.
                seed = [i for i in lines if i.split(";")[-1][0] == "*"][0]
                lines.pop(lines.index(seed))
                lines = [seed] + sorted(
                    lines, key=_get_derep_num, reverse=True)

                ## format remove extra newlines from muscle
                aa = [i.split("\n", 1) for i in lines]
                align1 = [i[0] + '\n' + "".join([j.replace("\n", "") 
                          for j in i[1:]]) for i in aa]
                
                # trim edges in sloppy gbs/ezrad data. 
                # Maybe relevant to other types too...
                if is_gbs:
                    align1 = _gbs_trim(align1)

                ## append to aligned
                aligned.append("\n".join(align1))
               
    # cleanup
    proc.stdout.close()
    if proc.stderr:
        proc.stderr.close()
    proc.stdin.close()
    proc.wait()

    ## return the aligned clusters
    return aligned  
In [35]:
for sample in s3.samples:
    ip.assemble.clustmap._sample_cleanup(s3.data, sample)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-35-a8a5a2cf58f8> in <module>()
      1 for sample in s3.samples:
----> 2     ip.assemble.clustmap._sample_cleanup(s3.data, sample)

~/Documents/ipyrad/ipyrad/assemble/clustmap.py in _sample_cleanup(data, sample)
   1262 
   1263     # get maxlen and depths array from clusters
-> 1264     maxlens, depths = _get_quick_depths(data, sample)
   1265 
   1266     try:

~/Documents/ipyrad/ipyrad/assemble/clustmap.py in _get_quick_depths(data, sample)
   1250 
   1251         else:
-> 1252             tdepth += int(name.split(";")[-2][5:])
   1253             tlen = len(seq)
   1254 

ValueError: invalid literal for int() with base 10: 'f72cb70f788295d6632d5d18442'
In [36]:
ip.assemble.clustmap._get_quick_depths(s3.data, sample)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-36-93c29eef65ac> in <module>()
----> 1 ip.assemble.clustmap._get_quick_depths(s3.data, sample)

~/Documents/ipyrad/ipyrad/assemble/clustmap.py in _get_quick_depths(data, sample)
   1250 
   1251         else:
-> 1252             tdepth += int(name.split(";")[-2][5:])
   1253             tlen = len(seq)
   1254 

ValueError: invalid literal for int() with base 10: 'f72cb70f788295d6632d5d18442'
In [37]:
if sample.files.get('clusters'):
    pass
In [43]:
fclust = data.samples[sample.name].files.clusters
clusters = gzip.open(fclust, 'rt')
pairdealer = izip(*[iter(clusters)] * 2)

## storage
depths = []
maxlen = []

## start with cluster 0
tdepth = 0
tlen = 0

## iterate until empty
while 1:
    ## grab next
    try:
        name, seq = next(pairdealer)
    except StopIteration:
        break

    ## if not the end of a cluster
    #print name.strip(), seq.strip()
    #print(name)
    if name.strip() == seq.strip():
        depths.append(tdepth)
        maxlen.append(tlen)
        tlen = 0
        tdepth = 0

    else:
        tdepth += int(name.strip().split("=")[-1][:-1])
        tlen = len(seq)
In [12]:
sample.name
Out[12]:
'1D_0'
In [ ]:
 
In [6]:
s3.remote_run_dereps()
[####################] 100% 0:00:02 | dereplicating      | s3 |
In [7]:
s3.remote_run_cluster_map_build()
[####################] 100% 0:00:05 | clustering/mapping | s3 |
[####################] 100% 0:00:00 | building clusters  | s3 |
[####################] 100% 0:00:00 | chunking clusters  | s3 |
In [8]:
s3.remote_run_align_cleanup()
[####################] 100% 0:00:02 | aligning clusters  | s3 |
---------------------------------------------------------------------------
IPyradError                               Traceback (most recent call last)
<ipython-input-8-2f667c287daa> in <module>()
----> 1 s3.remote_run_align_cleanup()

~/Documents/ipyrad/ipyrad/assemble/clustmap.py in remote_run_align_cleanup(self)
    333         for job in allasyncs:
    334             if not job.successful():
--> 335                 raise IPyradError(job.exception())
    336 
    337         # track job progress

IPyradError: TypeError(a bytes-like object is required, not 'str')
In [90]:
handle = "pairtest-tmpalign/1A_0_chunk_0.ali"
with open(handle, 'rb') as infile:
    clusts = infile.read().decode().split("//\n//\n")
    # remove any empty spots
    clusts = [i for i in clusts if i]
    # Skip entirely empty chunks
    if not clusts:
        raise IPyradError("no clusters in file: {}".format(handle))
In [91]:
maxseqs = 200
is_gbs = False
In [ ]:
proc = sps.Popen(
    ["bash"], 
    stdin=sps.PIPE, 
    stdout=sps.PIPE, 
    bufsize=0,
    )

## iterate over clusters in this file until finished
aligned = []
for clust in clusts:

    ## new alignment string for read1s and read2s
    align1 = []
    align2 = []

    ## don't bother aligning if only one seq
    if clust.count(">") == 1:
        aligned.append(clust.replace(">", "").strip())
    else:

        # do we need to split the alignment? (is there a PE insert?)
        try:
            # make into list (only read maxseqs lines, 2X cuz names)
            lclust = clust.split()[:maxseqs * 2]

            # try to split cluster list at nnnn separator for each read
            lclust1 = list(chain(*zip(
                lclust[::2], [i.split("nnnn")[0] for i in lclust[1::2]])))
            lclust2 = list(chain(*zip(
                lclust[::2], [i.split("nnnn")[1] for i in lclust[1::2]])))

            # put back into strings
            clust1 = "\n".join(lclust1)
            clust2 = "\n".join(lclust2)

            # Align the first reads.
            # The muscle command with alignment as stdin and // as split
            cmd1 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                    .format(clust1, ip.bins.muscle, "//\n"))

            # send cmd1 to the bash shell
            proc.stdin.write(cmd1.encode())

            # read the stdout by line until splitter is reached
            # meaning that the alignment is finished.
            for line in iter(proc.stdout.readline, '//\n'):
                align1.append(line)

            # Align the second reads.
            # The muscle command with alignment as stdin and // as split
            cmd2 = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                    .format(clust2, ip.bins.muscle, "//\n"))

            # send cmd2 to the bash shell
            proc.stdin.write(cmd2.encode())

            # read the stdout by line until splitter is reached
            # meaning that the alignment is finished.
            for line in iter(proc.stdout.readline, b'//\n'):
                align2.append(line)

            # join up aligned read1 and read2 and ensure names order match
            lines1 = "".join(align1)[1:].split("\n>")
            lines2 = "".join(align2)[1:].split("\n>")
            dalign1 = dict([i.split("\n", 1) for i in lines1])
            dalign2 = dict([i.split("\n", 1) for i in lines2])

            # sort the first reads
            keys = list(dalign1.keys())
            seed = [i for i in keys if i[-1] == "*"][0]
            keys.pop(keys.index(seed))
            order = [seed] + sorted(
                keys, key=_get_derep_num, reverse=True)                

            # combine in order
            for key in order:
                align1.append("\n".join([
                    key, 
                    dalign1[key].replace("\n", "") + "nnnn" + \
                    dalign2[key].replace("\n", "")]))

            ## append aligned cluster string
            aligned.append("\n".join(align1).strip())

        # Malformed clust. Dictionary creation with only 1 element 
        except ValueError as inst:
            ip.logger.debug(
                "Bad PE cluster - {}\nla1 - {}\nla2 - {}"
                .format(clust, lines1, lines2))

        ## Either reads are SE, or at least some pairs are merged.
        except IndexError:

            # limit the number of input seqs
            # use lclust already built before checking pairs
            lclust = "\n".join(clust.split()[:maxseqs * 2])

            # the muscle command with alignment as stdin and // as splitter
            cmd = ("echo -e '{}' | {} -quiet -in - ; echo {}"
                   .format(lclust, ip.bins.muscle, "//\n"))

            ## send cmd to the bash shell (TODO: PIPE could overflow here!)
            proc.stdin.write(cmd.encode())

            ## read the stdout by line until // is reached. This BLOCKS.
            for line in iter(proc.stdout.readline, b'//\n'):
                align1.append(line.decode())

            ## remove '>' from names, and '\n' from inside long seqs                
            lines = "".join(align1)[1:].split("\n>")

            ## find seed of the cluster and put it on top.
            seed = [i for i in lines if i.split(";")[-1][0] == "*"][0]
            lines.pop(lines.index(seed))
            lines = [seed] + sorted(
                lines, key=_get_derep_num, reverse=True)

            ## format remove extra newlines from muscle
            aa = [i.split("\n", 1) for i in lines]
            align1 = [i[0] + '\n' + "".join([j.replace("\n", "") 
                      for j in i[1:]]) for i in aa]

            # trim edges in sloppy gbs/ezrad data. 
            # Maybe relevant to other types too...
            if is_gbs:
                align1 = _gbs_trim(align1)

            ## append to aligned
            aligned.append("\n".join(align1))

# cleanup
proc.stdout.close()
if proc.stderr:
    proc.stderr.close()
proc.stdin.close()
proc.wait()

## return the aligned clusters
#return aligned   
In [ ]:
 
In [37]:
from ipyrad.assemble.clustmap import _get_derep_num
In [80]:
# join up aligned read1 and read2 and ensure names order match
lines1 = "".join(align1)[1:].split("\n>")
lines2 = "".join(align2)[1:].split("\n>")
dalign1 = dict([i.split("\n", 1) for i in lines1])
dalign2 = dict([i.split("\n", 1) for i in lines2])

# sort the first reads
keys = list(dalign1.keys())
seed = [i for i in keys if i[-1] == "*"][0]
keys.pop(keys.index(seed))
order = [seed] + sorted(
    keys, key=_get_derep_num, reverse=True)     
In [82]:
    # join up aligned read1 and read2 and ensure names order match
    lines1 = "".join(align1)[1:].split("\n>")
    lines2 = "".join(align2)[1:].split("\n>")
    dalign1 = dict([i.split("\n", 1) for i in lines1])
    dalign2 = dict([i.split("\n", 1) for i in lines2])

    # sort the first reads
    keys = list(dalign1.keys())
    seed = [i for i in keys if i[-1] == "*"][0]
    keys.pop(keys.index(seed))
    order = [seed] + sorted(
        keys, key=_get_derep_num, reverse=True)                

    # combine in order
    for key in order:
        align1.append("\n".join([
            key, 
            dalign1[key].replace("\n", "") + "nnnn" + \
            dalign2[key].replace("\n", "")]))

    ## append aligned cluster string
    aligned.append("\n".join(align1).strip())
In [83]:
aligned
Out[83]:
['>12f1ffdaa3a3d7c8310998dea05a56dd;size=22;*\n\nTGCAGCCTGGTCAATAGCCCCCAATTGGTCGATCCCGTTTATACTTGCAGAACAAATCGT\n\nTATATACAAAGACGGATGTGGTGCGAAATAC\n\n>1f4df54209f2d8d9be6019fde95d7579;size=1;+\n\nTGCAGCCTGGTCAATAGCCCCCAATTGGTCGATCCCGTTTATACTTGCAGAACAAATCGT\n\nTATATACAAAGACGGATGTGGTGCGAAATAC\n\n>4b85ee58466075729a68837e2b016ad7;size=1;+\n\nTGCAGCCTGGTCAATAGCCCCCAATTGGTCGATCCCGTTTATACTTGCAGAACAAATCGT\n\nTATATACAAAGACGGATGTGGTGCGAAATAC\n\n12f1ffdaa3a3d7c8310998dea05a56dd;size=22;*\nTGCAGCCTGGTCAATAGCCCCCAATTGGTCGATCCCGTTTATACTTGCAGAACAAATCGTTATATACAAAGACGGATGTGGTGCGAAATACnnnnGGCTCCTATTTCAAGTACCGTCTAATGTCAATAAGATGGTTTCGATGCGTGGAGAGAAACCCACTCTGAACGTCCCGATCACAGCGTTGGCTCTACTCCG\n1f4df54209f2d8d9be6019fde95d7579;size=1;+\nTGCAGCCTGGTCAATAGCCCCCAATTGGTCGATCCCGTTTATACTTGCAGAACAAATCGTTATATACAAAGACGGATGTGGTGCGAAATACnnnnGGCTCCTATTTCAAGTACCGTCTAATGTCAATAAGATGGTTTCGATGCGTGGAGAGAAACCCACTCTGAACGTCCCGATCACAGCGTTCGCTCTACTCCG\n4b85ee58466075729a68837e2b016ad7;size=1;+\nTGCAGCCTGGTCAATAGCCCCCAATTGGTCGATCCCGTTTATACTTGCAGAACAAATCGTTATATACAAAGACGGATGTGGTGCGAAATACnnnnGGCTCCTATTTCAAGTACCGTCTAATGTCAATAAGATGGTTTCGATGCGTGGAGAGAAACCCACTCTAAACGTCCCGATCACAGCGTTGGCTCTACTCCG']
In [57]:
## remove '>' from names, and '\n' from inside long seqs                
lines1 = "".join(align1)[1:].split("\n>")
seed = [i for i in lines1 if i.split(";")[-1][0] == "*"][0]
lines1.pop(lines1.index(seed))
lines1 = [seed] + sorted(
    lines1, key=_get_derep_num, reverse=True)
dalign1 = dict([i.split("\n", 1) for i in lines1])


lines2 = "".join(align2)[1:].split("\n>")
dalign2 = dict([i.split("\n", 1) for i in lines2])
#seed = [i for i in lines2 if i.split(";")[-1][0] == "*"][0]
#lines2.pop(lines2.index(seed))
In [24]:
## format remove extra newlines from muscle
aa = [i.split("\n", 1) for i in lines]
align1 = [i[0] + '\n' + "".join([j.replace("\n", "") 
          for j in i[1:]]) for i in aa]
In [25]:
align1
Out[25]:
['>12f1ffdaa3a3d7c8310998dea05a56dd;size=22;*\n',
 'TGCAGCCTGGTCAATAGCCCCCAATTGGTCGATCCCGTTTATACTTGCAGAACAAATCGT\n',
 'TATATACAAAGACGGATGTGGTGCGAAATAC\n',
 '>1f4df54209f2d8d9be6019fde95d7579;size=1;+\n',
 'TGCAGCCTGGTCAATAGCCCCCAATTGGTCGATCCCGTTTATACTTGCAGAACAAATCGT\n',
 'TATATACAAAGACGGATGTGGTGCGAAATAC\n',
 '>4b85ee58466075729a68837e2b016ad7;size=1;+\n',
 'TGCAGCCTGGTCAATAGCCCCCAATTGGTCGATCCCGTTTATACTTGCAGAACAAATCGT\n',
 'TATATACAAAGACGGATGTGGTGCGAAATAC\n']
In [16]:
derephandle = os.path.join(data.tmpdir, sample.name + "_derep.fastq")
uhandle = os.path.join(data.dirs.clusts, sample.name + ".utemp")
temphandle = os.path.join(data.dirs.clusts, sample.name + ".htemp")
In [19]:
derepfile = os.path.join(data.tmpdir, sample.name + "_derep.fastq")
uhandle = os.path.join(data.dirs.clusts, sample.name + ".utemp")
usort = os.path.join(data.dirs.clusts, sample.name + ".utemp.sort")
hhandle = os.path.join(data.dirs.clusts, sample.name + ".htemp")
sample.files.clusters = os.path.join(
    data.dirs.clusts, sample.name + ".clust.gz")
clustsout = gzip.open(sample.files.clusters, 'wt')
In [20]:
cmd = ["sort", "-k", "2", uhandle, "-o", usort]
proc = sps.Popen(cmd, close_fds=True)
proc.communicate()[0]
In [21]:
alldereps = {}
with open(derepfile, 'rt') as ioderep:
    dereps = izip(*[iter(ioderep)] * 2)
    for namestr, seq in dereps:
        nnn, sss = [i.strip() for i in (namestr, seq)]  
        alldereps[nnn[1:]] = sss
In [25]:
seedsseen = set()
maxindels = 8

## Iterate through the usort file grabbing matches to build clusters
with open(usort, 'rt') as insort:
    ## iterator, seed null, seqlist null
    isort = iter(insort)
    lastseed = 0
    fseqs = []
    seqlist = []
    seqsize = 0
    while 1:
        ## grab the next line
        try:
            hit, seed, _, ind, ori, _ = next(isort).strip().split()
        except StopIteration:
            break

        ## same seed, append match
        if seed != lastseed:
            seedsseen.add(seed)
            ## store the last cluster (fseq), count it, and clear fseq
            if fseqs:
                ## sort fseqs by derep after pulling out the seed
                fseqs = [fseqs[0]] + sorted(fseqs[1:], 
                    key=lambda x: 
                    int(x.split(";size=")[1].split(";")[0]), reverse=True)                    
                seqlist.append("\n".join(fseqs))
                seqsize += 1
                fseqs = []

            # occasionally write/dump stored clusters to file and clear mem
            if not seqsize % 10000:
                if seqlist:
                    clustsout.write(
                        "\n//\n//\n".join(seqlist) + "\n//\n//\n")
                    ## reset list and counter
                    seqlist = []

            ## store the new seed on top of fseq list
            fseqs.append(">{}*\n{}".format(seed, alldereps[seed]))
            lastseed = seed

        ## add match to the seed
        ## revcomp if orientation is reversed (comp preserves nnnn)
        if ori == "-":
            seq = comp(alldereps[hit])[::-1]
        else:
            seq = alldereps[hit]
        ## only save if not too many indels
        if int(ind) <= maxindels:
            fseqs.append(">{}{}\n{}".format(hit, ori, seq))
        else:
            ip.logger.info("filtered by maxindels: %s %s", ind, seq)

## write whatever is left over to the clusts file
if fseqs:
    seqlist.append("\n".join(fseqs))
if seqlist:
    clustsout.write("\n//\n//\n".join(seqlist) + "\n//\n//\n")

## now write the seeds that had no hits. Make dict from htemp
with open(hhandle, 'rt') as iotemp:
    nohits = izip(*[iter(iotemp)] * 2)
    seqlist = []
    seqsize = 0
    while 1:
        try:
            nnn, _ = [i.strip() for i in next(nohits)]
        except StopIteration:
            break

        ## occasionally write to file
        if not seqsize % 10000:
            if seqlist:
                clustsout.write("\n//\n//\n".join(seqlist) + "\n//\n//\n")
                ## reset list and counter
                seqlist = []

        ## append to list if new seed
        if nnn[1:] not in seedsseen:
            seqlist.append("{}*\n{}".format(nnn, alldereps[nnn[1:]]))
            seqsize += 1

## write whatever is left over to the clusts file
if seqlist:
    clustsout.write("\n//\n//\n".join(seqlist))

## close the file handle
clustsout.close()
del alldereps
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-25-0723860e8118> in <module>()
     25                 fseqs = [fseqs[0]] + sorted(fseqs[1:], 
     26                     key=lambda x:
---> 27                     int(x.split(";size=")[1].split(";")[0]), reverse=True)                    
     28                 seqlist.append("\n".join(fseqs))
     29                 seqsize += 1

<ipython-input-25-0723860e8118> in <lambda>(x)
     25                 fseqs = [fseqs[0]] + sorted(fseqs[1:], 
     26                     key=lambda x:
---> 27                     int(x.split(";size=")[1].split(";")[0]), reverse=True)                    
     28                 seqlist.append("\n".join(fseqs))
     29                 seqsize += 1

ValueError: invalid literal for int() with base 10: '1+\nTGCAGGTCACTTTTCAAGATACACTATTGTTATTACTGTGAGACACAAAGCTAATTCATCACTTCACGGATACCGCGTCCTCCTATAACGCnnnnCAATATTAACGCGTGAGTACCGGTTTCCTTGTGAGGAAGGCCCACTCTCAGTACCACCCTTATCCTATTCTAAGGCACACATGCATAGACCACTCAACCG
In [28]:
seedsseen
Out[28]:
{'00519cc27c6ab8f71dcdef028ad184e8;size=12',
 '00a2c3fa80127d8adfc783464de05df4;size=10'}
In [6]:
sample.concatfiles = concat_multiple_edits(data, sample)
In [7]:
sample.mergedfile = merge_pairs(data, sample, 1, 1)  
In [10]:
new_derep_and_sort(
    data,
    sample.mergedfile,
    os.path.join(data.tmpdir, sample.name + "_derep.fastq"),
    2)
In [6]:
data = data
sample = sample
revcomp = 1
vsearch_merge = 1

sample.concatfiles = concat_multiple_edits(data, sample)
sample.mergefile = os.path.join(data.tmpdir, sample.name + "_merged_.fastq")       
In [8]:
sample.mergefile = os.path.join(data.tmpdir, sample.name + "_merged_.fastq")       
if 'pair' in data.paramsdict['datatype']:
    if "reference" not in data.paramsdict["assembly_method"]:
        nmerged = ip.assemble.clustmap._merge_pairs(data, sample, 1, 1)
    else:
        nmerged = 0  # _merge_pairs(data, sample, 0, 0)          
    sample.stats.reads_merged = nmerged
In [ ]:
new_derep_and_sort(data, sampl)
In [10]:
def new_derep_and_sort(data, infile, outfile, nthreads):
    """
    Dereplicates reads and sorts so reads that were highly replicated are at
    the top, and singletons at bottom, writes output to derep file. Paired
    reads are dereplicated as one concatenated read and later split again.
    Updated this function to take infile and outfile to support the double
    dereplication that we need for 3rad (5/29/15 iao).
    """
    ## datatypes options
    strand = "plus"
    if data.paramsdict["datatype"] is ('gbs' or '2brad'):
        strand = "both"

    ## do dereplication with vsearch
    cmd = [
        ip.bins.vsearch,
        "--derep_fulllength", infile,
        "--strand", strand,
        "--output", outfile,
        "--threads", str(nthreads),
        "--fasta_width", str(0),
        "--fastq_qmax", "1000",
        "--sizeout", 
        "--relabel_md5",
        ]
    ip.logger.info("derep cmd %s", cmd)

    ## build PIPEd job
    proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE, close_fds=True)
    errmsg = proc.communicate()[0]
    if proc.returncode:
        ip.logger.error("error inside derep_and_sort %s", errmsg)
        raise IPyradWarningExit(errmsg)
In [ ]:
new_derep_and_sort()
In [43]:
## CONCAT FILES FOR MERGED ASSEMBLIES
mergefile = os.path.join(data.tmpdir, sample.name + "_merged_.fastq")
sample.files.edits = concat_multiple_edits(data, sample)
In [41]:
sample.files.edits = [(mergefile, )]
nmerged = ip.assemble.clustmap._merge_pairs(
    data, sample.files.edits, mergefile, 1, 1)
sample.stats.reads_merged = nmerged
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-41-50484bfab4a1> in <module>()
      1 sample.files.edits = [(mergefile, )]
      2 nmerged = ip.assemble.clustmap._merge_pairs(
----> 3     data, sample.files.edits, mergefile, 1, 1)
      4 sample.stats.reads_merged = nmerged

~/Documents/ipyrad/ipyrad/assemble/clustmap.py in _merge_pairs(data, two_files, merged_out, revcomp, merge)
    693     else:
    694         tmp1 = two_files[0][0]
--> 695         tmp2 = two_files[0][1]
    696 
    697     try:

IndexError: tuple index out of range
In [33]:
mergefile
Out[33]:
'/home/deren/Documents/ipyrad/pairtest-tmpalign/2E_0_merged_.fastq'
In [44]:
sample.files.edits
Out[44]:
[('/home/deren/Documents/ipyrad/pairtest-tmpalign/2E_0_merged_.fastq',)]
In [5]:
s3.run()
[####################] 100% 0:00:00 | dereplicating      | s3 |
[####################] 100% 0:00:01 | clustering/mapping | s3 |
[####################] 100% 0:00:00 | building clusters  | s3 |
[####################] 100% 0:00:00 | chunking clusters  | s3 |
[####################] 100% 0:00:15 | aligning clusters  | s3 |
[####################] 100% 0:00:00 | concat clusters    | s3 |
In [5]:
# muscle align returns values for bad alignments
ip.assemble.cluster_within.sample_cleanup(data, samples[0])
In [6]:
data._build_stat("s3")
Out[6]:
clusters_total hidepth_min clusters_hidepth avg_depth_total avg_depth_mj avg_depth_stat sd_depth_total sd_depth_mj sd_depth_stat filtered_bad_align
1A_0 1000.0 6.0 1000.0 19.862 19.862 19.862 2.830717 2.830717 2.830717 0.0
1B_0 1000.0 6.0 1000.0 20.043 20.043 20.043 2.807339 2.807339 2.807339 0.0
1C_0 1000.0 6.0 1000.0 20.136 20.136 20.136 2.874283 2.874283 2.874283 0.0
1D_0 1000.0 6.0 1000.0 19.966 19.966 19.966 2.738402 2.738402 2.738402 0.0
2E_0 1000.0 6.0 1000.0 20.017 20.017 20.017 2.778977 2.778977 2.778977 0.0
2F_0 1000.0 6.0 1000.0 19.933 19.933 19.933 2.833463 2.833463 2.833463 0.0
2G_0 1000.0 6.0 1000.0 20.030 20.030 20.030 2.773644 2.773644 2.773644 0.0
2H_0 1000.0 6.0 1000.0 20.199 20.199 20.199 2.870087 2.870087 2.870087 0.0
3I_0 1000.0 6.0 1000.0 19.885 19.885 19.885 3.012603 3.012603 3.012603 0.0
3J_0 1000.0 6.0 1000.0 19.822 19.822 19.822 2.878596 2.878596 2.878596 0.0
3K_0 1000.0 6.0 1000.0 19.965 19.965 19.965 2.885788 2.885788 2.885788 0.0
3L_0 1000.0 6.0 1000.0 20.008 20.008 20.008 2.904813 2.904813 2.904813 0.0
In [7]:
raise IPyradError("hi")
---------------------------------------------------------------------------
IPyradError                               Traceback (most recent call last)
<ipython-input-7-5110f32adaa3> in <module>()
----> 1 raise IPyradError("hi")

IPyradError: hi
In [4]:
self = data
derep_sort_map(s3.data, samples[0], s3.force)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-4-c34ce6e8f70d> in <module>()
      1 self = data
----> 2 derep_sort_map(s3.data, samples[0], s3.force)

TypeError: derep_sort_map() missing 1 required positional argument: 'nthreads'
In [11]:
ra.exception()
Out[11]:
<Remote[2]:TypeError(derep_sort_map() missing 1 required positional argument: 'sample')>
In [ ]:
data.get_params()