ipyrad testing for pairddrad data

In [1]:
import ipyrad as ip      ## for RADseq assembly
print ip.__version__     ## print version
0.1.37
In [2]:
## clear existing test_dir/
import shutil
import os
if os.path.exists("./test_pairddrad/"):
    shutil.rmtree("./test_pairddrad/")
In [7]:
data1 = ip.Assembly('data1')
  New Assembly: data1
In [8]:
data1.get_params()
  0   assembly_name               data1                                        
  1   project_dir                 .                                            
  2   raw_fastq_path              ./*.fastq                                    
  3   barcodes_path               ./*.barcodes.txt                             
  4   sorted_fastq_path                                                        
  5   assembly_method             denovo                                       
  6   reference_sequence                                                       
  7   datatype                    rad                                          
  8   restriction_overhang        ('TGCAG', '')                                
  9   max_low_qual_bases          5                                            
  10  phred_Qscore_offset         33                                           
  11  mindepth_statistical        6                                            
  12  mindepth_majrule            6                                            
  13  maxdepth                    1000                                         
  14  clust_threshold             0.85                                         
  15  max_barcode_mismatch        1                                            
  16  filter_adapters             0                                            
  17  filter_min_trim_len         35                                           
  18  max_alleles_consens         2                                            
  19  max_Ns_consens              (5, 5)                                       
  20  max_Hs_consens              (8, 8)                                       
  21  min_samples_locus           4                                            
  22  max_SNPs_locus              (100, 100)                                   
  23  max_Indels_locus            (5, 99)                                      
  24  max_shared_Hs_locus         0.25                                         
  25  edit_cutsites               (0, 0)                                       
  26  trim_overhang               (1, 2, 2, 1)                                 
  27  output_formats              *                                            
  28  pop_assign_file                                                          
  29  excludes                                                                 
  30  outgroups                                                                
In [11]:
data1.set_params('project_dir', "./test_pairddrad")
data1.set_params('raw_fastq_path', "./data/sim_pairddrad_*.gz")
data1.set_params('barcodes_path', "./data/sim_pairddrad_barcodes.txt")
data1.set_params('restriction_overhang', ("TGCAG", "AATT"))
data1.set_params('datatype', 'pairddrad')
#data1.set_params(17, 1)

data1.get_params()
  0   assembly_name               data1                                        
  1   project_dir                 ./test_pairddrad                             
  2   raw_fastq_path              ./data/sim_pairddrad_*.gz                    
  3   barcodes_path               ./data/sim_pairddrad_barcodes.txt            
  4   sorted_fastq_path                                                        
  5   assembly_method             denovo                                       
  6   reference_sequence                                                       
  7   datatype                    pairddrad                                    
  8   restriction_overhang        ('TGCAG', 'AATT')                            
  9   max_low_qual_bases          5                                            
  10  phred_Qscore_offset         33                                           
  11  mindepth_statistical        6                                            
  12  mindepth_majrule            6                                            
  13  maxdepth                    1000                                         
  14  clust_threshold             0.85                                         
  15  max_barcode_mismatch        1                                            
  16  filter_adapters             0                                            
  17  filter_min_trim_len         35                                           
  18  max_alleles_consens         2                                            
  19  max_Ns_consens              (5, 5)                                       
  20  max_Hs_consens              (8, 8)                                       
  21  min_samples_locus           4                                            
  22  max_SNPs_locus              (100, 100)                                   
  23  max_Indels_locus            (5, 99)                                      
  24  max_shared_Hs_locus         0.25                                         
  25  edit_cutsites               (0, 0)                                       
  26  trim_overhang               (1, 2, 2, 1)                                 
  27  output_formats              *                                            
  28  pop_assign_file                                                          
  29  excludes                                                                 
  30  outgroups                                                                
In [ ]:
#data1.link_fastqs(path="test_pairddrad/test_pairddrad_fastqs/", append=True)
In [12]:
data1.step1()
print data1.stats
  Saving current assembly.
     state  reads_raw
1A0      1      20000
1B0      1      20000
1C0      1      20000
1D0      1      20000
2E0      1      20000
2F0      1      20000
2G0      1      20000
2H0      1      20000
3I0      1      20000
3J0      1      20000
3K0      1      20000
3L0      1      20000
In [13]:
data1.step2()#["1B0", "2H0", "3J0", "3K0"], force=True)
print data1.stats
  Saving current assembly.
     state  reads_raw  reads_filtered
1A0      2      20000           20000
1B0      2      20000           20000
1C0      2      20000           20000
1D0      2      20000           20000
2E0      2      20000           20000
2F0      2      20000           20000
2G0      2      20000           20000
2H0      2      20000           20000
3I0      2      20000           20000
3J0      2      20000           20000
3K0      2      20000           20000
3L0      2      20000           20000
In [14]:
# data1.step3()                        ## do all samples
# data1.step3("1A0")                   ## do one sample
# data1.step3(["1A0", "1B0", "1C0"])   ## do list of samples
data1.step3()#["1B0", "2H0", "3J0", "3K0"], force=True) 
print data1.stats
  Saving current assembly.
     state  reads_raw  reads_filtered  clusters_total  clusters_hidepth
1A0      3      20000           20000            1000              1000
1B0      3      20000           20000            1000              1000
1C0      3      20000           20000            1000              1000
1D0      3      20000           20000            1000              1000
2E0      3      20000           20000            1000              1000
2F0      3      20000           20000            1000              1000
2G0      3      20000           20000            1000              1000
2H0      3      20000           20000            1000              1000
3I0      3      20000           20000            1000              1000
3J0      3      20000           20000            1000              1000
3K0      3      20000           20000            1000              1000
3L0      3      20000           20000            1000              1000
In [16]:
%%time
data1.step4()#["1B0", "2H0", "3J0", "3K0"], force=True) 
print data1.stats
    skipping 1B0; already estimated. Use force=True to overwrite.
    skipping 2H0; already estimated. Use force=True to overwrite.
    skipping 3J0; already estimated. Use force=True to overwrite.
    skipping 3K0; already estimated. Use force=True to overwrite.
  Saving current assembly.
     state  reads_raw  reads_filtered  clusters_total  clusters_hidepth  \
1A0      4      20000           20000            1000              1000   
1B0      4      20000           20000            1000              1000   
1C0      4      20000           20000            1000              1000   
1D0      4      20000           20000            1000              1000   
2E0      4      20000           20000            1000              1000   
2F0      4      20000           20000            1000              1000   
2G0      4      20000           20000            1000              1000   
2H0      4      20000           20000            1000              1000   
3I0      4      20000           20000            1000              1000   
3J0      4      20000           20000            1000              1000   
3K0      4      20000           20000            1000              1000   
3L0      4      20000           20000            1000              1000   

     hetero_est  error_est  
1A0    0.001308   0.000488  
1B0    0.001422   0.000494  
1C0    0.001427   0.000482  
1D0    0.001286   0.000497  
2E0    0.001226   0.000491  
2F0    0.001406   0.000479  
2G0    0.001497   0.000506  
2H0    0.001460   0.000502  
3I0    0.001432   0.000504  
3J0    0.001464   0.000483  
3K0    0.001412   0.000502  
3L0    0.001282   0.000470  
CPU times: user 331 ms, sys: 4.28 ms, total: 336 ms
Wall time: 7.07 s
In [1]:
#data1.save("upto2")
## still figuring out how best to save...
import ipyrad as ip
data1 = ip.load_assembly("test_pairddrad/test_pairddrad.assembly")
DEBUG:ipyrad:H4CKERZ-mode: __loglevel__ = DEBUG
INFO:ipyrad.core.parallel:Local connection to 4 engines [ipyrad-2480]
Loading Assembly: test_pairddrad  [test_pairddrad/test_pairddrad.assembly]
ipyparallel setup: Local connection to 4 engines

In [17]:
data1.step5()#["1B0"], force=True) 
print data1.stats
  Saving current assembly.
     state  reads_raw  reads_filtered  clusters_total  clusters_hidepth  \
1A0      5      20000           20000            1000              1000   
1B0      5      20000           20000            1000              1000   
1C0      5      20000           20000            1000              1000   
1D0      5      20000           20000            1000              1000   
2E0      5      20000           20000            1000              1000   
2F0      5      20000           20000            1000              1000   
2G0      5      20000           20000            1000              1000   
2H0      5      20000           20000            1000              1000   
3I0      5      20000           20000            1000              1000   
3J0      5      20000           20000            1000              1000   
3K0      5      20000           20000            1000              1000   
3L0      5      20000           20000            1000              1000   

     hetero_est  error_est  reads_consens  
1A0    0.001308   0.000488           1000  
1B0    0.001422   0.000494           1000  
1C0    0.001427   0.000482           1000  
1D0    0.001286   0.000497           1000  
2E0    0.001226   0.000491           1000  
2F0    0.001406   0.000479           1000  
2G0    0.001497   0.000506           1000  
2H0    0.001460   0.000502           1000  
3I0    0.001432   0.000504           1000  
3J0    0.001464   0.000483           1000  
3K0    0.001412   0.000502           1000  
3L0    0.001282   0.000470           1000  
In [ ]:
data1.step5(["1B0", "2H0", "3J0", "3K0"], force=True) 
print data1.stats
In [18]:
data1.step6()#(["1B0", "2H0", "3J0", "3K0"], force=True) 
print data1.stats
  Saving current assembly.
     state  reads_raw  reads_filtered  clusters_total  clusters_hidepth  \
1A0      6      20000           20000            1000              1000   
1B0      6      20000           20000            1000              1000   
1C0      6      20000           20000            1000              1000   
1D0      6      20000           20000            1000              1000   
2E0      6      20000           20000            1000              1000   
2F0      6      20000           20000            1000              1000   
2G0      6      20000           20000            1000              1000   
2H0      6      20000           20000            1000              1000   
3I0      6      20000           20000            1000              1000   
3J0      6      20000           20000            1000              1000   
3K0      6      20000           20000            1000              1000   
3L0      6      20000           20000            1000              1000   

     hetero_est  error_est  reads_consens  
1A0    0.001308   0.000488           1000  
1B0    0.001422   0.000494           1000  
1C0    0.001427   0.000482           1000  
1D0    0.001286   0.000497           1000  
2E0    0.001226   0.000491           1000  
2F0    0.001406   0.000479           1000  
2G0    0.001497   0.000506           1000  
2H0    0.001460   0.000502           1000  
3I0    0.001432   0.000504           1000  
3J0    0.001464   0.000483           1000  
3K0    0.001412   0.000502           1000  
3L0    0.001282   0.000470           1000  
In [19]:
data1.step7()
inc ['1B0', '2G0', '1C0', '1A0', '2H0', '2E0', '3L0', '3I0', '1D0', '2F0', '3J0', '3K0']
sidx:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
finished filtering
  Saving current assembly.
In [21]:
data1.outfiles
Out[21]:
{'loci': '/home/deren/Documents/ipyrad/tests/test_pairddrad/data1_outfiles/data1.loci'}
In [28]:
less $data1.outfiles.loci
In [ ]:
import pandas as pd
print pd.read_table('test_pairddrad/test_pairddrad_consens/s5_consens.txt', delim_whitespace=1, header=0)
In [ ]:
## how to merge Assembly objects
data1.files.edits['1B0']
data1.samples['1B0'].files
In [ ]:
print data1.stats
In [ ]:
for i in data1.log:
    print i
In [ ]:
import numpy as np
In [ ]:
adds = np.ones([10, 4], dtype='int16')
adds.shape
In [ ]:
## two-element int16 array; as the last expression of the cell its value is
## what the cell displays
np.array([30, 1], dtype='int16')
In [ ]:
## NOTE(review): longest_reads is assigned here but not used in this cell;
## the second axis of arr below is hard-coded to 200 instead -- confirm
## which of the two values is intended.
longest_reads = 190
nreads = int(1e5)

## np.empty does not initialize its buffer, so every position that is not
## explicitly written below holds arbitrary values.
arr = np.empty([nreads,200, 4], dtype='int16')
## relies on `adds` created in an earlier cell: copies it into the first
## adds.shape[0] rows of the first entry of arr, then displays that entry
arr[0][0:adds.shape[0]] = adds
arr[0]
In [ ]:
import glob
import os
import numpy
In [ ]:
catg = numpy.load("test_pairddrad/test_pairddrad_consens/1B0.catg")
In [ ]:
catg[0]
In [ ]:
cats1 = glob.glob(os.path.join(
                      data1.dirs.consens,
                      data1.samples['1B0'].name+"_tmpcats.*"))
In [ ]:
cats1.sort(key=lambda x: int(x.split(".")[-1]))
In [ ]:
catg = numpy.load(cats1[0])
lastg = numpy.load(cats1[-1])
In [ ]:
catg[0]
In [ ]:
lastg.shape

Making alignment faster

In [ ]:
def alignfast(data, names, seqs):
    """ makes subprocess call to muscle through a shell pipeline

    Builds a FASTA string (a ">"+name line followed by its sequence) from
    the positionally paired items of names and seqs, echoes it into
    `data.muscle -quiet -in -` and returns what the pipeline printed.

    Parameters
    ----------
    data : object whose `muscle` attribute holds the muscle command.
    names : iterable of str; ">" is prepended to each one here.
    seqs : iterable of str, paired with names by position.

    Returns
    -------
    The pipeline's stdout with stderr merged into it, as handed back by
    Popen.communicate().
    """
    inputstring = "\n".join(">"+i+"\n"+j for i, j in zip(names, seqs))
    ## the FASTA text is passed to echo inside single quotes, so an embedded
    ## "'" has to be written as '\'' or it would end the quoted argument early
    quoted = "'" + inputstring.replace("'", "'\\''") + "'"
    cmd = "/bin/echo "+quoted+" | "+data.muscle+" -quiet -in -"
    piped = subprocess.Popen(cmd, shell=True,
                       stdin=subprocess.PIPE,
                       stdout=subprocess.PIPE,
                       stderr=subprocess.STDOUT,
                       close_fds=True)
    ## communicate() closes the unused stdin pipe, reads the output to EOF
    ## and waits for the child, so no half-finished process is left behind
    fout, _ = piped.communicate()
    return fout
In [ ]:
def alignfast(data, names, seqs):
    """ makes subprocess call to muscle without going through a shell

    Builds a FASTA string (a ">"+name line followed by its sequence) from
    the positionally paired items of names and seqs, writes it to the
    stdin of `data.muscle -quiet -in -` and returns the program's stdout.

    Parameters
    ----------
    data : object whose `muscle` attribute holds the muscle command.
    names : iterable of str; ">" is prepended to each one here.
    seqs : iterable of str, paired with names by position.

    Returns
    -------
    str, everything the child process wrote to stdout.
    """
    inputstring = "\n".join(">"+i+"\n"+j for i, j in zip(names, seqs))
    ## no shell is involved here, so "|" cannot chain commands (it would be
    ## handed to the program as a literal argument); muscle is started
    ## directly and the FASTA text is fed on its stdin instead
    cmd = data.muscle+" -quiet -in -"
    ## universal_newlines gives text-mode pipes: a str goes in, a str comes out
    piped = subprocess.Popen(shlex.split(cmd),
                       stdin=subprocess.PIPE,
                       stdout=subprocess.PIPE,
                       universal_newlines=True)
    ## communicate() writes the input, closes stdin and waits for the child
    return piped.communicate(inputstring)[0]
In [ ]:
%%timeit
out = alignfast(data1, names, seqs)
In [ ]:
print out
In [ ]:
def newmuscle(data, names, seqs):
    """ pipes a FASTA string into data.muscle and returns its stdout

    Each name gets a ">" prefix and is followed by its positionally paired
    sequence; the joined text is written to the stdin of the process.
    """
    records = [">"+name+"\n"+seq for name, seq in zip(names, seqs)]
    fasta = "\n".join(records)
    proc = subprocess.Popen(data.muscle,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    ## first item of communicate() is stdout; stderr is not captured
    stdout, _ = proc.communicate(fasta)
    return stdout
In [ ]:
%%timeit 
newmuscle(data1, names, seqs)
In [ ]:
def alignfast(data, names, seqs):
    """ makes subprocess call to muscle with check_output

    Builds a FASTA string (a ">"+name line followed by its sequence) from
    the positionally paired items of names and seqs and echoes it into
    `data.muscle -quiet -in -` through a shell pipeline.

    Parameters
    ----------
    data : object whose `muscle` attribute holds the muscle command.
    names : iterable of str; ">" is prepended to each one here.
    seqs : iterable of str, paired with names by position.

    Returns
    -------
    The stdout of the pipeline as returned by subprocess.check_output.

    Raises
    ------
    subprocess.CalledProcessError if the pipeline exits with a non-zero
    status.
    """
    inputstring = "\n".join(">"+i+"\n"+j for i, j in zip(names, seqs))
    ## the FASTA text is passed to echo inside single quotes, so an embedded
    ## "'" has to be written as '\'' or it would end the quoted argument early
    quoted = "'" + inputstring.replace("'", "'\\''") + "'"
    cmd = "/bin/echo "+quoted+" | "+data.muscle+" -quiet -in -"
    return subprocess.check_output(cmd, shell=True)
In [ ]:
out = alignfast(data1, names, seqs)
print out
In [ ]:
%%timeit
out = alignfast(data1, names, seqs)
In [ ]:
inputstring = "\n".join(i+"\n"+j for i, j in zip(names, seqs))
print inputstring
In [ ]:
## run muscle with "-h" and capture what it prints; the option is given as a
## command-line argument, so nothing has to be written to the child's stdin
## NOTE(review): assumes the goal of this cell was to inspect muscle's help
## text, using the same data1.muscle command the align functions use
pipe = subprocess.Popen(shlex.split(data1.muscle + " -h"),
                        stderr=subprocess.PIPE, stdout=subprocess.PIPE)
## communicate() waits for the process and returns (stdout, stderr)
get = pipe.communicate()
get
In [ ]:
def alignfast2(data, names, seqs):
    """ makes subprocess call to muscle, returning stdout and stderr

    Joins names and seqs (paired by position) into alternating lines and
    writes that text to the stdin of `data.muscle -quiet -in -`.

    Parameters
    ----------
    data : object whose `muscle` attribute holds the muscle command.
    names : iterable of str, written verbatim as the header lines (no ">"
        is added here).
    seqs : iterable of str, paired with names by position.

    Returns
    -------
    (stdout, stderr) : tuple of str captured from the child process.
    """
    inputstring = "\n".join(i+"\n"+j for i, j in zip(names, seqs))
    ## no shell is involved here, so "|" cannot chain commands (it would be
    ## handed to the program as a literal argument); muscle is started
    ## directly and the sequences are fed on its stdin instead
    cmd = data.muscle+" -quiet -in -"

    ## universal_newlines gives text-mode pipes: a str goes in, str comes out
    piped = subprocess.Popen(shlex.split(cmd),
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True)
    ## the data to align is the input, and communicate() hands back the
    ## child's stdout and stderr in that order
    stdout, stderr = piped.communicate(input=inputstring)
    return stdout, stderr
In [ ]:
def alignfast3(data, names, seqs):
    """ makes subprocess call to muscle with check_output

    Joins names and seqs (paired by position) into alternating lines and
    echoes that text into `data.muscle -quiet -in -` through a shell
    pipeline.

    Parameters
    ----------
    data : object whose `muscle` attribute holds the muscle command.
    names : iterable of str, written verbatim as the header lines (no ">"
        is added here).
    seqs : iterable of str, paired with names by position.

    Returns
    -------
    The stdout of the pipeline as returned by subprocess.check_output.

    Raises
    ------
    subprocess.CalledProcessError if the pipeline exits with a non-zero
    status.
    """
    inputstring = "\n".join(i+"\n"+j for i, j in zip(names, seqs))
    ## the text is passed to echo inside single quotes, so an embedded "'"
    ## has to be written as '\'' or it would end the quoted argument early
    quoted = "'" + inputstring.replace("'", "'\\''") + "'"
    cmd = "/bin/echo "+quoted+" | "+data.muscle+" -quiet -in -"

    return subprocess.check_output(cmd, shell=True)
In [ ]:
def sortalign(stringnames):
    """ parses muscle output from a string to two lists

    The text is cut into records at every "\\n>"; the first line of a
    record is its name and the remaining lines are concatenated into one
    sequence string.

    Parameters
    ----------
    stringnames : str, FASTA-formatted text.

    Returns
    -------
    list holding two tuples, (names...) and (seqs...), both ordered by
    name; every name carries a single leading ">". Blank input gives two
    empty tuples.
    """
    if not stringnames.strip():
        return [(), ()]

    newnames = []
    seqs = []
    for record in stringnames.split("\n>"):
        lines = record.split("\n")
        ## only the first record still has its ">"; normalise all of them
        newnames.append(">" + lines[0].replace(">", ""))
        ## wrapped sequence lines are glued back together
        seqs.append("".join(lines[1:]))

    ## sort the (name, seq) pairs by name, then transpose the pairs into a
    ## names tuple and a seqs tuple; the transpose is returned directly so
    ## it works for any number of sequences, not just two
    ordered = sorted(zip(newnames, seqs), key=lambda pair: pair[0])
    return list(zip(*ordered))

Get clusters

In [ ]:
import gzip
import subprocess
import shlex
## read the clustered reads of sample 1B0, split the text on the "//\n//\n"
## separator and keep the first 10 pieces; the with-block closes the gzip
## handle as soon as the text has been read
with gzip.open("test_pairddrad/test_pairddrad_clust_0.85/1B0.clust.gz") as infile:
    clusts = infile.read().split("//\n//\n")[:10]
In [ ]:
## walk over every cluster; names and seqs are overwritten on each pass, so
## after the loop they hold the lines of the last cluster in clusts only
for clust in clusts:
    lines = clust.split("\n")
    ## even-indexed lines are taken as names, odd-indexed lines as sequences
    names = lines[::2]
    seqs = lines[1::2]
    
## splice "AA" into the first sequence at index 7 (a str cannot be edited in
## place, hence the round trip through a list)
## NOTE(review): presumably this forces an indel so the aligner has work to
## do in the timing cells -- confirm
seqs[0] = list(seqs[0])
seqs[0].insert(7, "AA")
seqs[0] = "".join(seqs[0])
In [ ]:
%%timeit
alignfast(data1, names, seqs)
In [ ]:
%%timeit 
alignfast3(data1, names, seqs)
In [ ]:
out = alignfast3(data1, names, seqs)
#sorts = sortalign(out)
print out
In [ ]:
out
In [ ]:
def sortalign(stringnames):
    """ parses muscle output from a string to two lists

    The first character of the text is dropped, then the rest is cut into
    records at every "\\n>"; the first line of a record is its name and
    the remaining lines are concatenated into one sequence string.

    Parameters
    ----------
    stringnames : str, FASTA-formatted text.
        NOTE(review): the dropped first character is presumably the ">" of
        the first record -- confirm against the actual muscle output.

    Returns
    -------
    list holding two tuples, (names...) and (seqs...), both ordered by
    name; every name carries a single leading ">". Blank input gives two
    empty tuples.
    """
    if not stringnames.strip():
        return [(), ()]

    newnames = []
    seqs = []
    for record in stringnames[1:].split("\n>"):
        lines = record.split("\n")
        newnames.append(">" + lines[0].replace(">", ""))
        ## wrapped sequence lines are glued back together
        seqs.append("".join(lines[1:]))

    ## sort the (name, seq) pairs by name, then transpose the pairs into a
    ## names tuple and a seqs tuple; the transpose is returned directly so
    ## it works for any number of sequences, not just two
    ordered = sorted(zip(newnames, seqs), key=lambda pair: pair[0])
    return list(zip(*ordered))
In [ ]:
def parsemuscle(out):
    """ parse muscle string output into two sorted lists

    The first character of the text is dropped, then the rest is cut into
    records at every "\\n>". The first line of a record is its name; the
    remaining lines are concatenated into one sequence string.

    Records are ordered by the integer found after the first character of
    the last ";"-separated field of the name (a name ending in ";*0" sorts
    before one ending in ";+3").

    Parameters
    ----------
    out : str, FASTA-formatted text.

    Returns
    -------
    (names, seqs) : two lists in matching order; two empty lists when out
    is blank.

    Raises
    ------
    ValueError if the last field of a name does not end in an integer.
    IndexError if a record consists of a name line only.
    """
    ## nothing to parse
    if not out.strip():
        return [], []

    lines = out[1:].split("\n>")
    names = [line.split("\n", 1)[0] for line in lines]
    seqs = [line.split("\n", 1)[1].replace("\n", "") for line in lines]
    tups = zip(names, seqs)
    anames, aseqs = zip(*sorted(tups, key=lambda x: int(x[0].split(";")[-1][1:])))
    ## hand the sorted results back to the caller as lists
    return list(anames), list(aseqs)
In [ ]: