In [1]:

import ipyrad as ip
import ipyparallel as ipp 

In [2]:

from ipyrad.assemble.clustmap import *
from ipyrad.assemble.consens_se import *

In [3]:

ipyclient = ipp.Client()

/home/deren/miniconda3/lib/python3.6/site-packages/ipyparallel/client/client.py:459: RuntimeWarning: 
            Controller appears to be listening on localhost, but not on this machine.
            If this is true, you should specify Client(...,sshserver='you@oud')
            or instruct your controller to listen on an external IP.
  RuntimeWarning)

In [4]:

data = ip.Assembly("test")
data.set_params("project_dir", "hotfix")
data.set_params("raw_fastq_path", "ipsimdata/pairddrad_example_R*_.fastq.gz")
data.set_params("barcodes_path", "ipsimdata/pairddrad_example_barcodes.txt")
data.set_params("reference_sequence", "ipsimdata/pairddrad_example_genome.fa")
data.set_params("assembly_method", "reference")
data.set_params("datatype", "pairddrad")
data.set_params("restriction_overhang", ("TGCAG", "CCG"))    
data.get_params()

New Assembly: test
0   assembly_name               test                                         
1   project_dir                 ./hotfix                                     
2   raw_fastq_path              ./ipsimdata/pairddrad_example_R*_.fastq.gz   
3   barcodes_path               ./ipsimdata/pairddrad_example_barcodes.txt   
4   sorted_fastq_path                                                        
5   assembly_method             reference                                    
6   reference_sequence          ./ipsimdata/pairddrad_example_genome.fa      
7   datatype                    pairddrad                                    
8   restriction_overhang        ('TGCAG', 'CCG')                             
9   max_low_qual_bases          5                                            
10  phred_Qscore_offset         33                                           
11  mindepth_statistical        6                                            
12  mindepth_majrule            6                                            
13  maxdepth                    10000                                        
14  clust_threshold             0.85                                         
15  max_barcode_mismatch        0                                            
16  filter_adapters             0                                            
17  filter_min_trim_len         35                                           
18  max_alleles_consens         2                                            
19  max_Ns_consens              (5, 5)                                       
20  max_Hs_consens              (8, 8)                                       
21  min_samples_locus           4                                            
22  max_SNPs_locus              (20, 20)                                     
23  max_Indels_locus            (8, 8)                                       
24  max_shared_Hs_locus         0.5                                          
25  trim_reads                  (0, 0, 0, 0)                                 
26  trim_loci                   (0, 0, 0, 0)                                 
27  output_formats              ['p', 's', 'v']                              
28  pop_assign_file

In [5]:

data.run("12", force=True)

Assembly: test
[####################] 100% 0:00:04 | sorting reads        | s1 |
[####################] 100% 0:00:01 | writing/compressing  | s1 |
[####################] 100% 0:00:04 | processing reads     | s2 |

In [7]:

data.run("3")

Assembly: test
[####################] 100% 0:00:00 | indexing reference   | s3 |
[####################] 100% 0:00:00 | concatenating        | s3 |
[####################] 100% 0:00:01 | join unmerged pairs  | s3 |
[####################] 100% 0:00:03 | dereplicating        | s3 |
[####################] 100% 0:00:00 | splitting dereps     | s3 |
[####################] 100% 0:00:03 | mapping reads        | s3 |
[####################] 100% 0:00:10 | building clusters    | s3 |
[####################] 100% 0:00:00 | calc cluster stats   | s3 |

In [8]:

data.run('4')

Assembly: test
[####################] 100% 0:00:06 | inferring [H, E]     | s4 |

In [9]:

data.run("5")

Assembly: test
[####################] 100% 0:00:00 | calculating depths   | s5 |
[####################] 100% 0:00:00 | chunking clusters    | s5 |
[####################] 100% 0:00:27 | consens calling      | s5 |
[####################] 100% 0:00:01 | indexing alleles     | s5 |

In [11]:

data.run("6")

Assembly: test
[####################] 100% 0:00:00 | concatenating bams   | s6 |
[####################] 100% 0:00:00 | fetching regions     | s6 |
[####################] 100% 0:00:00 | building loci        | s6 |

In [51]:

from ipyrad.assemble.write_outputs import *
bself = Step7(data, True, ipyclient)
self.split_clusters()

jobs = glob.glob(os.path.join(self.data.tmpdir, "chunk-*"))
jobs = sorted(jobs, key=lambda x: int(x.rsplit("-")[-1]))        
for jobfile in jobs:
    args = (self.data, self.chunksize, jobfile)

In [119]:

def get_edges(self, seqs):
    """
    Trim terminal edges or mask internal edges based on three criteria and
    take the max for each edge.
    1. user entered hard trimming.
    2. removing cutsite overhangs.
    3. trimming singleton-like overhangs from seqs of diff lengths.
    """
    # record whether to filter this locus based on sample coverage
    bad = False

    # 1. hard trim edges
    trim1 = np.array(self.data.paramsdict["trim_loci"])

    # 2. fuzzy match for trimming restriction site where it's expected.
    trim2 = np.array([0, 0, 0, 0])
    overhangs = np.array([
        i.encode() for i in self.data.paramsdict["restriction_overhang"]
        ])
    for pos, overhang in enumerate(overhangs):
        if overhang:
            cutter = np.array(list(overhang))
            trim2[pos] = check_cutter(seqs, pos, cutter, 0.75)

    # 3. find where the edge is not indel marked (really unknown ("N"))
    trim3 = np.array([0, 0, 0, 0])
    try:            
        minsamp = min(4, seqs.shape[0])
        # minsamp = max(minsamp, self.data.paramsdict["min_samples_locus"])
        mincovs = np.sum((seqs != 78) & (seqs != 45), axis=0)
        for pos in range(4):
            trim3[pos] = check_minsamp(seqs, pos, minsamp, mincovs)
    except ValueError:
        print('error')
        bad = True

    # get max edges
    print(trim1, trim2, trim3)
    trim = np.max([trim1, trim2, trim3], axis=0)

    # return edges as slice indices
    r1left = trim[0]
    
    # single-end simple:
    if "pair" not in self.data.paramsdict["datatype"]:
        r1right = seqs.shape[1] - trim[1]
        r2left = r2right = r1right
        edges = (r1left, r1right, r2left, r2right)
        
    else:
        r1right = 
        
        

    # get filter
    if (r1right < r1left) or (r2left < r1right) or (r2right < r2left):
        bad = True

    # if loc length is too short then filter
    if (r2right - r1left) < self.data.paramsdict["filter_min_trim_len"]:
        bad = True

    return bad, edges

In [120]:

def check_minsamp(seq, position, minsamp, mincovs):
    "used in Processor.get_edges() for trimming edges of - or N sites."
    
    if position == 0:           
        # find leftmost edge with minsamp coverage
        leftmost = np.where(mincovs >= minsamp)[0]
        if leftmost.size:
            return leftmost.min()
        
        # no sites actually have minsamp coverage although there are minsamp
        # rows of data, this can happen when reads only partially overlap. Loc
        # should be excluded for minsamp filter.
        else:
            raise ValueError("no sites above minsamp coverage in edge trim")
    
    elif position == 1:
        maxhit = np.where(mincovs >= minsamp)[0].max()
        return seq.shape[1] - (maxhit + 1)

    ## TODO...    
    elif position == 2:
        return 0
    
    else:
        return 0

In [121]:

# store list of edge trims for VCF building
edgelist = []

# todo: this could be an iterator...
with open(self.chunkfile, 'rb') as infile:
    loci = infile.read().split(b"//\n//\n")

    # iterate over loci
    for iloc, loc in enumerate(loci):                              
        # load names and seqs 
        lines = loc.decode().strip().split("\n")
        names = []
        nidxs = []
        aseqs = []
        useqs = []
        for line in lines:
            if line[0] == ">":
                name, nidx = line[1:].rsplit("_", 1)
                names.append(name)
                nidxs.append(nidx)
            else:
                aseqs.append(list(line))
                useqs.append(list(line.upper()))

        # filter to include only samples in this assembly
        mask = [i in self.data.snames for i in names]
        names = np.array(names)[mask].tolist()
        nidxs = np.array(nidxs)[mask].tolist()
        useqs = np.array(useqs)[mask, :].astype(bytes).view(np.uint8)
        aseqs = np.array(aseqs)[mask, :].astype(bytes).view(np.uint8)

        # apply filters
        efilter, edges = get_edges(self, useqs)
        print(efilter, edges)

[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 456, 456, 456)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 471, 471, 471)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 466, 466, 466)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 478, 478, 478)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 464, 464, 464)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 459, 459, 459)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 484, 484, 484)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 441, 441, 441)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 470, 470, 470)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 486, 486, 486)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 442, 442, 442)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 465, 465, 465)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 478, 478, 478)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 453, 453, 453)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 453, 453, 453)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 466, 466, 466)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 473, 473, 473)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 441, 441, 441)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 438, 438, 438)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 464, 464, 464)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 482, 482, 482)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 443, 443, 443)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 469, 469, 469)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 456, 456, 456)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 480, 480, 480)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 463, 463, 463)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 478, 478, 478)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 438, 438, 438)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 442, 442, 442)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 443, 443, 443)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 454, 454, 454)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 467, 467, 467)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 451, 451, 451)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 472, 472, 472)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 476, 476, 476)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 439, 439, 439)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 457, 457, 457)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 453, 453, 453)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 475, 475, 475)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 448, 448, 448)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 440, 440, 440)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 460, 460, 460)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 469, 469, 469)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 473, 473, 473)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 444, 444, 444)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 442, 442, 442)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 449, 449, 449)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 484, 484, 484)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 486, 486, 486)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 478, 478, 478)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 467, 467, 467)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 479, 479, 479)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 476, 476, 476)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 446, 446, 446)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 452, 452, 452)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 478, 478, 478)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 450, 450, 450)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 458, 458, 458)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 453, 453, 453)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 452, 452, 452)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 451, 451, 451)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 459, 459, 459)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 443, 443, 443)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 484, 484, 484)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 453, 453, 453)
[0 0 0 0] [5 3 0 0] [0 0 0 0]
False (5, 442, 442, 442)

In [54]:

data = self.data
chunksize = self.chunksize
chunkfile = jobs[0]

self = Processor(data, chunksize, chunkfile)

In [23]:

self.remote_process_chunks()
self.collect_stats()
self.store_file_handles()

[####################] 100% 0:00:01 | applying filters     | s7 |

In [24]:

start = time.time()
printstr = ("building arrays     ", "s7")
rasyncs = {}
args0 = (self.data,)
args1 = (self.data, self.ntaxa, self.nbases, self.nloci)
args2 = (self.data, self.ntaxa, self.nsnps)

write_loci_and_alleles(*args0)

In [25]:

fill_seq_array(*args1)

In [49]:

data = self.data
ntaxa = self.ntaxa
nsnps = self.nsnps

# get faidict to convert chroms to ints
if data.isref:
    faidict = chroms2ints(data, True)

# open new database file handle
with h5py.File(data.snps_database, 'w') as io5:

    # Database files for storing arrays on disk. 
    # Should optimize for slicing by rows if we run into slow writing, or 
    # it uses too much mem. For now letting h5py to auto-chunking.
    io5.create_dataset(
        name="snps",
        shape=(ntaxa, nsnps),
        dtype=np.uint8,
    )
    # store snp locations:
    # (loc-counter, loc-snp-counter, loc-snp-pos, chrom, chrom-snp-pos)
    io5.create_dataset(
        name="snpsmap",
        shape=(nsnps, 5),
        dtype=np.uint32,
    )
    # store snp locations
    io5.create_dataset(
        name="pseudoref",
        shape=(nsnps, 4),
        dtype=np.uint8,
    )
    # store genotype calls (0/0, 0/1, 0/2, etc.)
    io5.create_dataset(
        name="genos",
        shape=(nsnps, ntaxa, 2),
        dtype=np.uint8,
    )

    # gather all loci bits
    locibits = glob.glob(os.path.join(data.tmpdir, "*.loci"))
    sortbits = sorted(locibits, 
        key=lambda x: int(x.rsplit("-", 1)[-1][:-5]))

    # name order for entry in array
    sidxs = {sample: i for (i, sample) in enumerate(data.snames)}

    # iterate through file
    start = end = 0
    tmploc = {}
    locidx = 1
    snpidx = 1   
    
    # array to store until writing
    tmparr = np.zeros((ntaxa, nsnps), dtype=np.uint8)
    tmpmap = np.zeros((nsnps, 5), dtype=np.uint32)

    # iterate over chunkfiles
    for bit in sortbits:
        # iterate lines of file until locus endings
        for line in iter(open(bit, 'r')):

            # still filling locus until |\n
            if "|\n" not in line:
                name, seq = line.split()
                tmploc[name] = seq

            # locus is full, dump it
            else:
                # convert seqs to an array
                loc = (
                    np.array([list(i) for i in tmploc.values()])
                    .astype(bytes).view(np.uint8)
                    )
                snps, idxs, _ = line[len(data.snppad):].rsplit("|", 2)
                snpsmask = np.array(list(snps)) != " "
                snpsidx = np.where(snpsmask)[0]

                # select only the SNP sites
                snpsites = loc[:, snpsmask]

                # store end position of locus for map
                end = start + snpsites.shape[1]
                print(start, end)
                
                for idx, name in enumerate(tmploc):
                    tmparr[sidxs[name], start:end] = snpsites[idx, :]

                # store snpsmap data 1-indexed with chroms info
                if data.isref:
                    chrom, pos = idxs.split(",")[0].split(":")
                    start = int(pos.split("-")[0])
                    #chromidx = faidict[chrom]
                    chromidx = int(chrom)
                    for isnp in range(snpsites.shape[1]):
                        isnpx = snpsidx[isnp]
                        tmpmap[snpidx - 1] = (
                            locidx, isnp, isnpx, chromidx, isnpx + start,
                        )
                        snpidx += 1
                # store snpsmap data (snpidx is 1-indexed)
                else:
                    for isnp in range(snpsites.shape[1]):
                        tmpmap[snpidx - 1] = (
                            locidx, isnp, snpsidx[isnp], 0, snpidx,
                        )
                        snpidx += 1
                locidx += 1

                # reset locus
                start = end
                tmploc = {}

    # fill missing with 78 (N)
    tmparr[tmparr == 0] = 78

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-49-3630a9f0177d> in <module>()
     85 
     86                 for idx, name in enumerate(tmploc):
---> 87                     tmparr[sidxs[name], start:end] = snpsites[idx, :]
     88 
     89                 # store snpsmap data 1-indexed with chroms info

ValueError: could not broadcast input array from shape (12) into shape (7)

In [48]:

print(tmparr.shape)


print(start, end)
tmparr[sidxs[name], start:end]

(13, 2474)
2467 2479

Out[48]:

array([0, 0, 0, 0, 0, 0, 0], dtype=uint8)

In [21]:

fill_snp_array(*args2)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-21-f157821ffeb6> in <module>()
----> 1 fill_snp_array(*args2)

~/Documents/ipyrad/ipyrad/assemble/write_outputs.py in fill_snp_array(data, ntaxa, nsnps)
   1585                     end = start + snpsites.shape[1]
   1586                     for idx, name in enumerate(tmploc):
-> 1587                         tmparr[sidxs[name], start:end] = snpsites[idx, :]
   1588 
   1589                     # store snpsmap data 1-indexed with chroms info

ValueError: could not broadcast input array from shape (12) into shape (7)

In [12]:

data.run("7")

Assembly: test
[####################] 100% 0:00:05 | applying filters     | s7 |
[####################] 100% 0:00:01 | building arrays      | s7 |

Encountered an unexpected error:
ValueError(could not broadcast input array from shape (12) into shape (7))

---------------------------------------------------------------------------ValueError                                Traceback (most recent call last)<string> in <module>()
~/Documents/ipyrad/ipyrad/assemble/write_outputs.py in fill_snp_array(data, ntaxa, nsnps)
   1585                     end = start + snpsites.shape[1]
   1586                     for idx, name in enumerate(tmploc):
-> 1587                         tmparr[sidxs[name], start:end] = snpsites[idx, :]
   1588 
   1589                     # store snpsmap data 1-indexed with chroms info
ValueError: could not broadcast input array from shape (12) into shape (7)

In [ ]:

In [5]:

data.run("3")

Assembly: test
[####################] 100% 0:00:00 | indexing reference   | s3 |
[####################] 100% 0:00:00 | concatenating        | s3 |
[####################] 100% 0:00:01 | join unmerged pairs  | s3 |
[####################] 100% 0:00:00 | dereplicating        | s3 |
[####################] 100% 0:00:00 | splitting dereps     | s3 |
[####################] 100% 0:00:02 | mapping reads        | s3 |
[####################] 100% 0:00:09 | building clusters    | s3 |
[####################] 100% 0:00:00 | calc cluster stats   | s3 |

In [6]:

data.run("456")

Assembly: test
[####################] 100% 0:00:02 | inferring [H, E]     | s4 |
[####################] 100% 0:00:00 | calculating depths   | s5 |
[####################] 100% 0:00:00 | chunking clusters    | s5 |
[####################] 100% 0:00:24 | consens calling      | s5 |
[####################] 100% 0:00:01 | indexing alleles     | s5 |
[####################] 100% 0:00:00 | concatenating bams   | s6 |
[####################] 100% 0:00:00 | building clusters    | s6 |

  Encountered an unexpected error (see ./ipyrad_log.txt)
  Error message is below -------------------------------
name 'data' is not defined

In [11]:

data.run("1", force=True)

Assembly: test
[force] overwriting fastq files previously created by ipyrad.
This _does not_ affect your original/raw data files.
[####################] 100% 0:00:03 | sorting reads        | s1 |
[####################] 100% 0:00:00 | writing/compressing  | s1 |

In [9]:

data = ip.load_json("tortas/5-tortas.json")

loading Assembly: 5-tortas
from saved path: ~/Documents/ipyrad/tests/tortas/5-tortas.json

In [18]:

d2 = data.branch("d2", subsamples=["AGO09concat"])

In [19]:

d2.run("3", force=True)

Assembly: d2
[####################] 100% 0:00:00 | indexing reference   | s3 |
[####################] 100% 0:00:00 | concatenating        | s3 |
[####################] 100% 0:02:44 | join unmerged pairs  | s3 |
[####################] 100% 0:01:14 | dereplicating        | s3 |
[####################] 100% 0:00:18 | splitting dereps     | s3 |
[####################] 100% 0:52:29 | mapping reads        | s3 |
[####################] 100% 0:55:43 | building clusters    | s3 |
[####################] 100% 0:00:13 | calc cluster stats   | s3 |

In [20]:

d2.run("45", force=True)

Assembly: d2
[####################] 100% 0:00:21 | inferring [H, E]     | s4 |
[####################] 100% 0:00:13 | calculating depths   | s5 |
[####################] 100% 0:00:22 | chunking clusters    | s5 |
[####################] 100% 15:27:41 | consens calling      | s5 |
[####################] 100% 0:00:41 | indexing alleles     | s5 |

In [7]:

d2 = ip.load_json("./tortas/d2.json")

loading Assembly: d2
from saved path: ~/Documents/ipyrad/tests/tortas/d2.json

In [9]:

d2.stats

Out[9]:

	state	reads_raw	reads_passed_filter	refseq_mapped_reads	refseq_unmapped_reads	clusters_total	clusters_hidepth	hetero_est	error_est	reads_consens
AGO09concat	5	15650127	15121047	8307851	6813196	1364279	381081	0.001636	0.001511	380396

In [11]:

self = Step5(data, True, ipyclient)

In [12]:

self.remote_calculate_depths()

[####################] 100% 0:00:17 | calculating depths   | s5 |

In [13]:

self.remote_make_chunks()

[####################] 100% 0:00:28 | chunking clusters    | s5 |

In [14]:

statsdicts = self.remote_process_chunks()

[####################] 100% 1:07:30 | consens calling      | s5 |

In [15]:

statsdicts

Out[15]:

{'AGO08concat': [({'name': 90642,
    'heteros': 15544,
    'nsites': 22873090,
    'nconsens': 90642},
   {'depth': 227971, 'maxh': 30, 'maxn': 2}),
  ({'name': 406474, 'heteros': 17080, 'nsites': 22034591, 'nconsens': 87829},
   {'depth': 230745, 'maxh': 69, 'maxn': 2}),
  ({'name': 725373, 'heteros': 17170, 'nsites': 22095933, 'nconsens': 88083},
   {'depth': 230497, 'maxh': 62, 'maxn': 3}),
  ({'name': 1044454, 'heteros': 28259, 'nsites': 22420201, 'nconsens': 88519},
   {'depth': 229823, 'maxh': 301, 'maxn': 2})],
 'AGO11concat': [({'name': 88491,
    'heteros': 15532,
    'nsites': 17265948,
    'nconsens': 88491},
   {'depth': 144187, 'maxh': 31, 'maxn': 0}),
  ({'name': 319207, 'heteros': 16819, 'nsites': 16849885, 'nconsens': 86498},
   {'depth': 146149, 'maxh': 62, 'maxn': 0}),
  ({'name': 552196, 'heteros': 16745, 'nsites': 16895490, 'nconsens': 86778},
   {'depth': 145870, 'maxh': 61, 'maxn': 0}),
  ({'name': 785074, 'heteros': 26253, 'nsites': 17128785, 'nconsens': 86947},
   {'depth': 145382, 'maxh': 374, 'maxn': 0})],
 'AGO02concat': [({'name': 79397,
    'heteros': 14200,
    'nsites': 16889465,
    'nconsens': 79397},
   {'depth': 189044, 'maxh': 24, 'maxn': 0}),
  ({'name': 345488, 'heteros': 15332, 'nsites': 16378730, 'nconsens': 77023},
   {'depth': 191394, 'maxh': 48, 'maxn': 0}),
  ({'name': 614298, 'heteros': 15677, 'nsites': 16486755, 'nconsens': 77368},
   {'depth': 191048, 'maxh': 49, 'maxn': 0}),
  ({'name': 883384, 'heteros': 24439, 'nsites': 16840082, 'nconsens': 77989},
   {'depth': 190103, 'maxh': 370, 'maxn': 0})],
 'AGO09concat': [({'name': 1, 'heteros': 0, 'nsites': 147, 'nconsens': 1},
   {'depth': 9, 'maxh': 0, 'maxn': 0})]}

In [ ]:

self.remote_concatenate_chunks()

In [ ]:

self.data_store(statsdicts)

In [10]:

data.stats

Out[10]:

	state	reads_raw	reads_passed_filter	refseq_mapped_reads	refseq_unmapped_reads	clusters_total	clusters_hidepth	hetero_est	error_est	reads_consens
AGO02concat	5	11050294	10800672	7210402	3590270	1073857	312271	0.002455	0.001599	310509
AGO08concat	5	13408401	13030329	7245593	5784736	1274580	355547	0.002825	0.001497	353139
AGO09concat	5	15650127	15121047	8307851	6813196	1364279	1	0.002711	0.001882	378504
AGO11concat	5	12848936	12370018	7855116	4514902	930830	349245	0.002108	0.001574	347812

In [5]:

dd = data.branch("dd")
dd.run("5", force=True)

Assembly: dd
[####################] 100% 0:00:17 | calculating depths   | s5 |
[####################] 100% 0:00:31 | chunking clusters    | s5 |
[####################] 100% 1 day, 1:41:43 | consens calling      | s5 |
[####################] 100% 0:00:18 | indexing alleles     | s5 |

  Encountered an error (see details in ./ipyrad_log.txt)
  Error summary is below -------------------------------
KeyboardInterrupt()

In [6]:

dd.run("3", force=True)

Out[6]:

	state	reads_raw	reads_passed_filter	refseq_mapped_reads	refseq_unmapped_reads	clusters_total	clusters_hidepth	hetero_est	error_est	reads_consens
AGO02concat	4	11050294	10800672	7210402	3590270	1073857	312271	0.001542	0.001308	310509
AGO08concat	4	13408401	13030329	7245593	5784736	1274580	355547	0.001910	0.001097	353139
AGO09concat	4	15650127	15121047	8307851	6813196	1364279	1	0.000042	0.002260	378504
AGO11concat	4	12848936	12370018	7855116	4514902	930830	349245	0.001330	0.001307	347812

In [4]:

data.run("5", force=True)

Assembly: 5-tortas
[####################] 100% 0:00:16 | calculating depths   | s5 |
[####################] 100% 0:00:29 | chunking clusters    | s5 |
[####################] 100% 1:04:59 | consens calling      | s5 |
[####################] 100% 0:00:59 | indexing alleles     | s5 |

  Encountered an error (see details in ./ipyrad_log.txt)
  Error summary is below -------------------------------
IPyradError(error in samtools: b'[E::sam_parse1] CIGAR and query sequence are of different length\n[W::sam_read1] parse error at line 10786\n[main_samview] truncated file.\n')

In [5]:

step = Step6(data, True, ipyclient)

In [6]:

step.remote_concat_bams()

[####################] 100% 0:00:00 | concatenating bams   | s6 |

In [7]:

step.remote_build_ref_regions()

[####################] 100% 0:00:00 | building clusters    | s6 |

In [8]:

self = step

In [9]:

regs = self.regions[:20]
regs

Out[9]:

[('MT', 5008, 5467),
 ('MT', 10476, 10950),
 ('MT', 15959, 16428),
 ('MT', 21437, 21918),
 ('MT', 26927, 27394),
 ('MT', 32403, 32865),
 ('MT', 37874, 38361),
 ('MT', 43370, 43814),
 ('MT', 48823, 49296),
 ('MT', 54305, 54794),
 ('MT', 59803, 60248),
 ('MT', 65257, 65725),
 ('MT', 70734, 71215),
 ('MT', 76224, 76680),
 ('MT', 81689, 82145),
 ('MT', 87154, 87623),
 ('MT', 92632, 93108),
 ('MT', 98117, 98561),
 ('MT', 103570, 104011),
 ('MT', 109020, 109487)]

In [11]:

# access reads from bam file using pysam
bamfile = AlignmentFile(
    os.path.join(
        self.data.dirs.across,
        "cat.sorted.bam"),
    'rb')

# catcons output file for raw clusters and db samples
outfile = gzip.open(
    os.path.join(
        self.data.dirs.across,
        "{}.catcons.gz".format(self.data.name)),
    'wb')

# write header line to catcons with sample names
snames = sorted([i.name for i in self.samples])
nsamples = len(snames)
outfile.write(
    b" ".join([i.encode() for i in snames]) + b"\n")

# get clusters
lidx = 0
clusts = []
# while 1:

#     try:
#         region = next(self.regions)
#         reads = bamfile.fetch(*region)
#     except StopIteration:
#         break

for region in regs:
    reads = bamfile.fetch(*region)
 
    # get reference
    print(region)
    refn, refs = get_ref_region(
        data.paramsdict["reference_sequence"], 
        region[0], region[1]+1, region[2]+1)  
    
    # build cluster dict for sorting                
    rdict = {}
    for read in reads:
        rdict[read.qname] = read.seq   
    keys = sorted(rdict.keys(), key=lambda x: x.rsplit(":", 2)[0])
    
    # build cluster based on map positions (reads are already oriented)
    arr = np.zeros((nsamples + 1, len(refs)), dtype=bytes)
    
    # fill it
    arr[0] = list(refs)
    for idx, key in enumerate(keys):
        # get start and stop from this seq
        sname = key.rsplit("_", 1)[0]
        rstart, rstop = key.rsplit(":", 2)[-1].split("-")
        sidx = snames.index(sname)
        
        # get start relative to ref
        start = int(rstart) - int(region[1]) - 1
        stop = start + int(rstop) - int(rstart)
        print(sidx + 1, start, stop, arr.shape[1])
        arr[sidx + 1, int(start): int(stop)] = list(rdict[key])
        
    print("")
    arr[arr == b""] = b"N"
    for line in arr:
        outfile.write(line.tostring() + b"\n")
    outfile.write(b"\n")
    
        
outfile.close()

('MT', 5008, 5467)
4 0 459 459
5 0 459 459
7 0 459 459
8 0 459 459
11 0 459 459

('MT', 10476, 10950)
1 0 474 474
3 0 474 474
5 0 474 474
7 0 474 474
8 0 474 474
11 0 474 474

('MT', 15959, 16428)
1 0 469 469
4 0 469 469
5 0 469 469
7 0 469 469
8 0 469 469
9 0 469 469
11 0 469 469

('MT', 21437, 21918)
1 0 481 481
2 0 481 481
3 0 481 481
5 0 481 481
6 0 481 481
7 0 481 481
8 0 481 481
11 0 481 481
12 0 481 481

('MT', 26927, 27394)
1 0 467 467
2 0 467 467
3 0 467 467
4 0 467 467
6 0 467 467
7 0 467 467
8 0 467 467
10 0 467 467
11 0 467 467
12 0 467 467

('MT', 32403, 32865)
2 0 462 462
3 0 462 462
4 0 462 462
5 0 462 462
6 0 462 462
7 0 462 462
8 0 462 462
9 0 462 462
10 0 462 462
11 0 462 462
12 0 462 462

('MT', 37874, 38361)
3 0 487 487
4 0 487 487
5 0 487 487
7 0 487 487
9 0 487 487
10 0 487 487

('MT', 43370, 43814)
1 0 444 444
2 0 444 444
3 0 444 444
4 0 444 444
6 0 444 444
7 0 444 444
8 0 444 444
9 0 444 444
11 0 444 444

('MT', 48823, 49296)
1 0 473 473
2 0 473 473
3 0 473 473
4 0 473 473
5 0 473 473
6 0 473 473
7 0 473 473
9 0 473 473
10 0 473 473
11 0 473 473

('MT', 54305, 54794)
1 0 489 489
2 0 489 489
3 0 489 489
4 0 489 489
6 0 489 489
7 0 489 489
8 0 489 489
9 0 489 489
10 0 489 489
12 0 489 489

('MT', 59803, 60248)
2 0 445 445
3 0 445 445
4 0 445 445
6 0 445 445
7 0 445 445
8 0 445 445
10 0 445 445
11 0 445 445

('MT', 65257, 65725)
1 0 468 468
2 0 468 468
3 0 468 468
4 0 468 468
5 0 468 468
6 0 468 468
8 0 468 468
11 0 468 468

('MT', 70734, 71215)
2 0 481 481
4 0 481 481
5 0 481 481
7 0 481 481
9 0 481 481
10 0 481 481
11 0 481 481
12 0 481 481

('MT', 76224, 76680)
1 0 456 456
2 0 456 456
3 0 456 456
4 0 456 456
5 0 456 456
6 0 456 456
7 0 456 456
8 0 456 456
10 0 456 456
11 0 456 456
12 0 456 456

('MT', 81689, 82145)
1 0 456 456
2 0 456 456
3 0 456 456
4 0 456 456
6 0 456 456
7 0 456 456
8 0 456 456
10 0 456 456
12 0 456 456

('MT', 87154, 87623)
1 0 469 469
2 0 469 469
3 0 469 469
4 0 469 469
5 0 469 469
6 0 469 469
7 0 469 469
9 0 469 469
10 0 469 469
11 0 469 469
12 0 469 469

('MT', 92632, 93108)
1 0 476 476
3 0 476 476
4 0 476 476
5 0 476 476
7 0 476 476
9 0 476 476
10 0 476 476
11 0 476 476
12 0 476 476

('MT', 98117, 98561)
2 0 444 444
3 0 444 444
5 0 444 444
6 0 444 444
8 0 444 444
9 0 444 444
11 0 444 444

('MT', 103570, 104011)
1 0 441 441
2 0 441 441
4 0 441 441
5 0 441 441
6 0 441 441
7 0 441 441
8 0 441 441
9 0 441 441
10 0 441 441
11 0 441 441

('MT', 109020, 109487)
1 0 467 467
2 0 467 467
3 0 467 467
4 0 467 467
6 0 467 467
7 0 467 467
8 0 467 467
9 0 467 467
10 0 467 467
11 0 467 467
12 0 467 467

In [50]:

print(start, stop, stop-start, len(rdict[key]), rstart, rstop, int(rstop) - int(rstart))
print(region, region[2] - region[1], len(refs))

300 559 259 559 41920 42479 559
('Contig0', 41619, 42478) 859 859

In [61]:

key.split("")

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-61-55088ce8a832> in <module>()
----> 1 key.split("")

ValueError: empty separator

In [62]:

snames

Out[62]:

['AGO02concat', 'AGO08concat', 'AGO09concat', 'AGO11concat']

In [141]:

revcomp("AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCKCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN")

Out[141]:

'NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCATGGCTTGTATTAGAGATGGGGCTGAAGTCAAACCCTGACCTGAACAGTGAGAAAATCTGGTGCCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTCCTCTGCCTCCCTCAAGAAGAAATCAAGAGKGGAAAGGAGCAGGGCGGGAGGTATGGGAAAGGAAGAATGGAATT'

In [37]:

    # get consens seq and variant site index 
    clust = []
    avars = refvars(arr.view(np.uint8), PSEUDO_REF)
    dat = b"".join(avars.view("S1")[:, 0]).decode()
    snpstring = "".join(["*" if i else " " for i in avars[:, 1]])
    clust.append("ref_{}:{}-{}\n{}".format(*region, dat))

    # or, just build variant string (or don't...)
    # write all loci with [locids, nsnps, npis, nindels, ?]
    for key in keys:
        clust.append("{}\n{}".format(key, rdict[key]))
    clust.append("SNPs\n" + snpstring)
    clusts.append("\n".join(clust))

    # advance locus counter
    lidx += 1

    # write chunks to file
    if not lidx % 1000:
        outfile.write(
            str.encode("\n//\n//\n".join(clusts) + "\n//\n//\n"))
        clusts = []

# write remaining
if clusts:                
    outfile.write(
        str.encode("\n//\n//\n".join(clusts) + "\n//\n//\n"))
outfile.close()

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-37-7858eebe5047> in <module>()
     43     arr = np.zeros((len(rdict), len(refs)), dtype=bytes)
     44     for idx, key in enumerate(keys):
---> 45         arr[idx] = list(rdict[key])
     46 
     47     # get consens seq and variant site index

ValueError: cannot copy sequence with size 231 to array axis with dimension 535

In [16]:

step.remote_build_ref_clusters()

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-16-2a0e5422841e> in <module>()
----> 1 step.remote_build_ref_clusters()

~/Documents/ipyrad/ipyrad/assemble/cluster_across.py in remote_build_ref_clusters(self)
    588             arr = np.zeros((len(rdict), len(read.seq)), dtype=bytes)
    589             for idx, key in enumerate(keys):
--> 590                 arr[idx] = list(rdict[key])
    591 
    592             # get consens seq and variant site index

ValueError: cannot copy sequence with size 543 to array axis with dimension 559

In [4]:

from ipyrad.assemble.clustmap import *

In [6]:

self = Step3(data, 8, True, ipyclient)

In [7]:

self.run()

[####################] 100% 0:00:00 | indexing reference   | s3 |
[####################] 100% 0:00:00 | concatenating        | s3 |
[####################] 100% 0:05:42 | join unmerged pairs  | s3 |
[####################] 100% 0:02:17 | dereplicating        | s3 |
[####################] 100% 0:00:39 | splitting dereps     | s3 |
[####################] 100% 1:53:03 | mapping reads        | s3 |
[####################] 100% 1:40:12 | building clusters    | s3 |
[####################] 100% 0:00:24 | calc cluster stats   | s3 |

In [8]:

self.data.run("45")

Assembly: 5-tortas
[####################] 100% 0:15:41 | inferring [H, E]     | s4 |
[####################] 100% 0:00:26 | calculating depths   | s5 |
[####################] 100% 0:00:45 | chunking clusters    | s5 |
[####################] 100% 16:10:59 | consens calling      | s5 |
[####################] 100% 0:01:31 | indexing alleles     | s5 |

In [13]:

self.remote_index_refs()
self.remote_run(
    function=concat_multiple_edits,
    printstr=("concatenating       ", "s3"),
    args=(),
)
self.remote_run(
    function=merge_end_to_end,
    printstr=("join unmerged pairs ", "s3"),
    args=(False, False,),
)

[####################] 100% 0:00:00 | indexing reference   | s3 |
[####################] 100% 0:00:00 | concatenating        | s3 |
[####################] 100% 0:05:49 | join unmerged pairs  | s3 |

In [15]:

self.remote_run(
    function=dereplicate,
    printstr=("dereplicating       ", "s3"),
    args=(self.nthreads,),
    threaded=True,
)

[####################] 100% 0:02:22 | dereplicating        | s3 |

In [16]:

self.remote_run(
    function=split_endtoend_reads,
    printstr=("splitting dereps    ", "s3"),
    args=(),
)

[####################] 100% 0:00:36 | splitting dereps     | s3 |

In [ ]:

self.remote_run(
    function=mapping_reads,
    printstr=("mapping reads       ", "s3"),
    args=(self.nthreads,),
    threaded=True,
)

[##########          ]  50% 1:18:01 | mapping reads        | s3 |

In [ ]:

sample = list(self.data.samples.values())[0]
merge_end_to_end(self.data, sample, True, True)

In [ ]:

In [15]:

sample = list(data.samples.values())[0]

infiles = [
    os.path.join(data.dirs.edits, "{}.trimmed_R1_.fastq.gz".format(sample.name)),
    os.path.join(data.dirs.edits, "{}_R1_concatedit.fq.gz".format(sample.name)),
    os.path.join(data.tmpdir, "{}_merged.fastq".format(sample.name)),
    os.path.join(data.tmpdir, "{}_declone.fastq".format(sample.name)),
]
infiles = [i for i in infiles if os.path.exists(i)]
infile = infiles[-1]

infile

Out[15]:

'/home/deren/Documents/ipyrad/tests/tortas/5-tortas-tmpalign/AGO02concat_merged.fastq'

In [19]:

strand = "plus"
if data.paramsdict["datatype"] is ('gbs' or '2brad'):
    strand = "both"
nthreads=2

cmd = [
    ip.bins.vsearch,
    "--derep_fulllength", infile,
    "--strand", strand,
    "--output", os.path.join(data.tmpdir, sample.name + "_derep.fastq"),
    "--threads", str(nthreads),
    "--fasta_width", str(0),
    "--fastq_qmax", "1000",
    "--sizeout", 
    "--relabel_md5",
]
cmd

Out[19]:

['/home/deren/Documents/ipyrad/bin/vsearch-linux-x86_64',
 '--derep_fulllength',
 '/home/deren/Documents/ipyrad/tests/tortas/5-tortas-tmpalign/AGO02concat_merged.fastq',
 '--strand',
 'plus',
 '--output',
 '/home/deren/Documents/ipyrad/tests/tortas/5-tortas-tmpalign/AGO02concat_derep.fastq',
 '--threads',
 '2',
 '--fasta_width',
 '0',
 '--fastq_qmax',
 '1000',
 '--sizeout',
 '--relabel_md5']

In [20]:

proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE, close_fds=True)
errmsg = proc.communicate()[0]
if proc.returncode:
    ip.logger.error("error inside dereplicate %s", errmsg)
    raise IPyradWarningExit(errmsg)

In [10]:

s.remote_run(
    function=dereplicate,
    printstr=("dereplicating       ", "s3"),
    args=(s.nthreads,),
    threaded=True,
)

[                    ]   0% 0:01:55 | dereplicating        | s3 |

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-10-9074a628ea2e> in <module>()
      3     printstr=("dereplicating       ", "s3"),
      4     args=(s.nthreads,),
----> 5     threaded=True,
      6 )

~/Documents/ipyrad/ipyrad/assemble/clustmap.py in remote_run(self, printstr, function, args, threaded)
    598             ready = [rasyncs[i].ready() for i in rasyncs]
    599             self.data._progressbar(len(ready), sum(ready), start, printstr)
--> 600             time.sleep(0.1)
    601             if len(ready) == sum(ready):
    602                 break

KeyboardInterrupt: