import ipyrad as ip ## for RADseq assembly
print ip.__version__ ## print version
0.1.37
## clear existing test_dir/
import shutil
import os
if os.path.exists("./test_pairddrad/"):
shutil.rmtree("./test_pairddrad/")
data1 = ip.Assembly('data1')
New Assembly: data1
data1.get_params()
0 assembly_name data1 1 project_dir . 2 raw_fastq_path ./*.fastq 3 barcodes_path ./*.barcodes.txt 4 sorted_fastq_path 5 assembly_method denovo 6 reference_sequence 7 datatype rad 8 restriction_overhang ('TGCAG', '') 9 max_low_qual_bases 5 10 phred_Qscore_offset 33 11 mindepth_statistical 6 12 mindepth_majrule 6 13 maxdepth 1000 14 clust_threshold 0.85 15 max_barcode_mismatch 1 16 filter_adapters 0 17 filter_min_trim_len 35 18 max_alleles_consens 2 19 max_Ns_consens (5, 5) 20 max_Hs_consens (8, 8) 21 min_samples_locus 4 22 max_SNPs_locus (100, 100) 23 max_Indels_locus (5, 99) 24 max_shared_Hs_locus 0.25 25 edit_cutsites (0, 0) 26 trim_overhang (1, 2, 2, 1) 27 output_formats * 28 pop_assign_file 29 excludes 30 outgroups
data1.set_params('project_dir', "./test_pairddrad")
data1.set_params('raw_fastq_path', "./data/sim_pairddrad_*.gz")
data1.set_params('barcodes_path', "./data/sim_pairddrad_barcodes.txt")
data1.set_params('restriction_overhang', ("TGCAG", "AATT"))
data1.set_params('datatype', 'pairddrad')
#data1.set_params(17, 1)
data1.get_params()
0 assembly_name data1 1 project_dir ./test_pairddrad 2 raw_fastq_path ./data/sim_pairddrad_*.gz 3 barcodes_path ./data/sim_pairddrad_barcodes.txt 4 sorted_fastq_path 5 assembly_method denovo 6 reference_sequence 7 datatype pairddrad 8 restriction_overhang ('TGCAG', 'AATT') 9 max_low_qual_bases 5 10 phred_Qscore_offset 33 11 mindepth_statistical 6 12 mindepth_majrule 6 13 maxdepth 1000 14 clust_threshold 0.85 15 max_barcode_mismatch 1 16 filter_adapters 0 17 filter_min_trim_len 35 18 max_alleles_consens 2 19 max_Ns_consens (5, 5) 20 max_Hs_consens (8, 8) 21 min_samples_locus 4 22 max_SNPs_locus (100, 100) 23 max_Indels_locus (5, 99) 24 max_shared_Hs_locus 0.25 25 edit_cutsites (0, 0) 26 trim_overhang (1, 2, 2, 1) 27 output_formats * 28 pop_assign_file 29 excludes 30 outgroups
#data1.link_fastqs(path="test_pairddrad/test_pairddrad_fastqs/", append=True)
data1.step1()
print data1.stats
Saving current assembly. state reads_raw 1A0 1 20000 1B0 1 20000 1C0 1 20000 1D0 1 20000 2E0 1 20000 2F0 1 20000 2G0 1 20000 2H0 1 20000 3I0 1 20000 3J0 1 20000 3K0 1 20000 3L0 1 20000
data1.step2()#["1B0", "2H0", "3J0", "3K0"], force=True)
print data1.stats
Saving current assembly. state reads_raw reads_filtered 1A0 2 20000 20000 1B0 2 20000 20000 1C0 2 20000 20000 1D0 2 20000 20000 2E0 2 20000 20000 2F0 2 20000 20000 2G0 2 20000 20000 2H0 2 20000 20000 3I0 2 20000 20000 3J0 2 20000 20000 3K0 2 20000 20000 3L0 2 20000 20000
# data1.step3() ## do all samples
# data1.step3("1A0") ## do one sample
# data1.step3(["1A0", "1B0", "1C0"]) ## do list of samples
data1.step3()#["1B0", "2H0", "3J0", "3K0"], force=True)
print data1.stats
Saving current assembly. state reads_raw reads_filtered clusters_total clusters_hidepth 1A0 3 20000 20000 1000 1000 1B0 3 20000 20000 1000 1000 1C0 3 20000 20000 1000 1000 1D0 3 20000 20000 1000 1000 2E0 3 20000 20000 1000 1000 2F0 3 20000 20000 1000 1000 2G0 3 20000 20000 1000 1000 2H0 3 20000 20000 1000 1000 3I0 3 20000 20000 1000 1000 3J0 3 20000 20000 1000 1000 3K0 3 20000 20000 1000 1000 3L0 3 20000 20000 1000 1000
%%time
data1.step4()#["1B0", "2H0", "3J0", "3K0"], force=True)
print data1.stats
skipping 1B0; already estimated. Use force=True to overwrite. skipping 2H0; already estimated. Use force=True to overwrite. skipping 3J0; already estimated. Use force=True to overwrite. skipping 3K0; already estimated. Use force=True to overwrite. Saving current assembly. state reads_raw reads_filtered clusters_total clusters_hidepth \ 1A0 4 20000 20000 1000 1000 1B0 4 20000 20000 1000 1000 1C0 4 20000 20000 1000 1000 1D0 4 20000 20000 1000 1000 2E0 4 20000 20000 1000 1000 2F0 4 20000 20000 1000 1000 2G0 4 20000 20000 1000 1000 2H0 4 20000 20000 1000 1000 3I0 4 20000 20000 1000 1000 3J0 4 20000 20000 1000 1000 3K0 4 20000 20000 1000 1000 3L0 4 20000 20000 1000 1000 hetero_est error_est 1A0 0.001308 0.000488 1B0 0.001422 0.000494 1C0 0.001427 0.000482 1D0 0.001286 0.000497 2E0 0.001226 0.000491 2F0 0.001406 0.000479 2G0 0.001497 0.000506 2H0 0.001460 0.000502 3I0 0.001432 0.000504 3J0 0.001464 0.000483 3K0 0.001412 0.000502 3L0 0.001282 0.000470 CPU times: user 331 ms, sys: 4.28 ms, total: 336 ms Wall time: 7.07 s
#data1.save("upto2")
## still figuring out how best to save...
import ipyrad as ip
data1 = ip.load_assembly("test_pairddrad/test_pairddrad.assembly")
DEBUG:ipyrad:H4CKERZ-mode: __loglevel__ = DEBUG INFO:ipyrad.core.parallel:Local connection to 4 engines [ipyrad-2480]
Loading Assembly: test_pairddrad [test_pairddrad/test_pairddrad.assembly] ipyparallel setup: Local connection to 4 engines
data1.step5()#["1B0"], force=True)
print data1.stats
Saving current assembly. state reads_raw reads_filtered clusters_total clusters_hidepth \ 1A0 5 20000 20000 1000 1000 1B0 5 20000 20000 1000 1000 1C0 5 20000 20000 1000 1000 1D0 5 20000 20000 1000 1000 2E0 5 20000 20000 1000 1000 2F0 5 20000 20000 1000 1000 2G0 5 20000 20000 1000 1000 2H0 5 20000 20000 1000 1000 3I0 5 20000 20000 1000 1000 3J0 5 20000 20000 1000 1000 3K0 5 20000 20000 1000 1000 3L0 5 20000 20000 1000 1000 hetero_est error_est reads_consens 1A0 0.001308 0.000488 1000 1B0 0.001422 0.000494 1000 1C0 0.001427 0.000482 1000 1D0 0.001286 0.000497 1000 2E0 0.001226 0.000491 1000 2F0 0.001406 0.000479 1000 2G0 0.001497 0.000506 1000 2H0 0.001460 0.000502 1000 3I0 0.001432 0.000504 1000 3J0 0.001464 0.000483 1000 3K0 0.001412 0.000502 1000 3L0 0.001282 0.000470 1000
data1.step5(["1B0", "2H0", "3J0", "3K0"], force=True)
print data1.stats
data1.step6()#(["1B0", "2H0", "3J0", "3K0"], force=True)
print data1.stats
Saving current assembly. state reads_raw reads_filtered clusters_total clusters_hidepth \ 1A0 6 20000 20000 1000 1000 1B0 6 20000 20000 1000 1000 1C0 6 20000 20000 1000 1000 1D0 6 20000 20000 1000 1000 2E0 6 20000 20000 1000 1000 2F0 6 20000 20000 1000 1000 2G0 6 20000 20000 1000 1000 2H0 6 20000 20000 1000 1000 3I0 6 20000 20000 1000 1000 3J0 6 20000 20000 1000 1000 3K0 6 20000 20000 1000 1000 3L0 6 20000 20000 1000 1000 hetero_est error_est reads_consens 1A0 0.001308 0.000488 1000 1B0 0.001422 0.000494 1000 1C0 0.001427 0.000482 1000 1D0 0.001286 0.000497 1000 2E0 0.001226 0.000491 1000 2F0 0.001406 0.000479 1000 2G0 0.001497 0.000506 1000 2H0 0.001460 0.000502 1000 3I0 0.001432 0.000504 1000 3J0 0.001464 0.000483 1000 3K0 0.001412 0.000502 1000 3L0 0.001282 0.000470 1000
data1.step7()
inc ['1B0', '2G0', '1C0', '1A0', '2H0', '2E0', '3L0', '3I0', '1D0', '2F0', '3J0', '3K0'] sidx: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] finished filtering Saving current assembly.
data1.outfiles
{'loci': '/home/deren/Documents/ipyrad/tests/test_pairddrad/data1_outfiles/data1.loci'}
less $data1.outfiles.loci
import pandas as pd
print pd.read_table('test_pairddrad/test_pairddrad_consens/s5_consens.txt', delim_whitespace=1, header=0)
## how to merge Assembly objects
data1.files.edits['1B0']
data1.samples['1B0'].files
print data1.stats
for i in data1.log:
print i
import numpy as np
adds = np.ones([10, 4], dtype='int16')
adds.shape
## FIX: original read `np.array.([30, 1], ...)` -- the stray "." after
## `array` is a SyntaxError. Build the 1-D int16 array as intended.
np.array([30, 1], dtype='int16')
longest_reads = 190
nreads = int(1e5)
arr = np.empty([nreads,200, 4], dtype='int16')
arr[0][0:adds.shape[0]] = adds
arr[0]
import glob
import os
import numpy
## Load the full consensus catg (site count) array saved for sample 1B0.
## NOTE(review): assumed to be a numpy .npy-format file despite the
## ".catg" extension -- numpy.load infers the format; confirm.
catg = numpy.load("test_pairddrad/test_pairddrad_consens/1B0.catg")
## peek at the first locus/site record
catg[0]
## Collect the temporary per-chunk cat files written during step 5 for
## sample 1B0 (named "<sample>_tmpcats.<chunkindex>").
cats1 = glob.glob(os.path.join(
    data1.dirs.consens,
    data1.samples['1B0'].name+"_tmpcats.*"))
## Sort numerically on the trailing chunk index -- a plain lexical sort
## would misorder ".10" before ".2".
cats1.sort(key=lambda x: int(x.split(".")[-1]))
## load the first and last chunks to compare contents/shapes
catg = numpy.load(cats1[0])
lastg = numpy.load(cats1[-1])
catg[0]
lastg.shape
def alignfast(data, names, seqs):
    """ makes subprocess call to muscle

    Pipes fasta-formatted records (">name\\nseq") through the muscle
    binary found at data.muscle and returns muscle's raw output
    (stderr is folded into stdout).
    """
    inputstring = "\n".join(">"+i+"\n"+j for i, j in zip(names, seqs))
    ## NOTE(review): the command line is built by string concatenation and
    ## run with shell=True -- a shell-injection hazard if names/seqs ever
    ## contain quotes; acceptable only for trusted cluster files.
    cmd = "/bin/echo '"+inputstring+"' | "+data.muscle+" -quiet -in -"
    piped = subprocess.Popen(cmd, shell=True,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             close_fds=True)
    ## FIX: the original read piped.stdout directly and never closed the
    ## unused stdin pipe nor waited on the child, leaking file handles and
    ## leaving a zombie process. communicate() does all three.
    out, _ = piped.communicate()
    return out
def alignfast(data, names, seqs):
    """ makes subprocess call to muscle

    Pipes fasta-formatted records (">name\\nseq") through the muscle
    binary found at data.muscle and returns muscle's stdout.
    """
    inputstring = "\n".join(">"+i+"\n"+j for i, j in zip(names, seqs))
    cmd = "/bin/echo '"+inputstring+"' | "+data.muscle+" -quiet -in -"
    ## FIX: the original ran shlex.split(cmd) without a shell, which hands
    ## the literal "|" token to /bin/echo as an argument instead of
    ## creating a pipeline -- muscle never executed. A shell pipe needs
    ## shell=True (injection hazard for untrusted names/seqs).
    piped = subprocess.Popen(cmd, shell=True,
                             stdout=subprocess.PIPE)
    ## communicate() also reaps the child (the original never waited)
    out, _ = piped.communicate()
    return out
%%timeit
out = alignfast(data1, names, seqs)
print out
def newmuscle(data, names, seqs):
    """ Feed fasta-formatted records to muscle over stdin; return its stdout.

    Unlike the echo-pipeline variants, this writes the input directly to
    muscle's stdin via communicate() (py2 str-over-pipe semantics).
    """
    ## assemble ">name\nseq" records into one fasta string
    records = [">" + name + "\n" + seq for name, seq in zip(names, seqs)]
    fasta = "\n".join(records)
    proc = subprocess.Popen(data.muscle,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    out, _ = proc.communicate(fasta)
    return out
%%timeit
newmuscle(data1, names, seqs)
def alignfast(data, names, seqs):
    """ makes subprocess call to muscle

    Builds ">name\\nseq" fasta records, echoes them into muscle through a
    shell pipeline, and returns muscle's stdout. Raises CalledProcessError
    if the pipeline exits nonzero.
    """
    fasta = "\n".join(">%s\n%s" % pair for pair in zip(names, seqs))
    ## NOTE(review): shell pipeline built from raw strings -- injection
    ## hazard if names/seqs are untrusted.
    cmd = "/bin/echo '" + fasta + "' | " + data.muscle + " -quiet -in -"
    return subprocess.check_output(cmd, shell=True)
out = alignfast(data1, names, seqs)
print out
%%timeit
out = alignfast(data1, names, seqs)
inputstring = "\n".join(i+"\n"+j for i, j in zip(names, seqs))
print inputstring
pipe = subprocess.Popen(shlex.split(cmd),
stdin=subprocess.PIPE,
stderr=subprocess.PIPE, stdout=subprocess.PIPE)
pipe.stdin.write("muscle -h")
get = pipe.communicate(input=pipe)
get
def alignfast2(data, names, seqs):
    """ makes subprocess call to muscle

    Echoes "name\\nseq" records (no ">" prefix in this variant) into
    muscle via a shell pipeline and returns the (stdout, stderr) pair.
    """
    inputstring = "\n".join(i+"\n"+j for i, j in zip(names, seqs))
    cmd = "/bin/echo '"+inputstring+"' | "+data.muscle+" -quiet -in -"
    ## FIX: shell=True so "|" actually builds a pipeline; the original's
    ## shlex.split(cmd) passed the literal "|" to /bin/echo as an argument.
    piped = subprocess.Popen(cmd, shell=True,
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
    ## FIX: do not feed the command string back in as stdin
    ## (communicate(input=cmd) in the original); echo already supplies the
    ## data. communicate returns (stdout, stderr) -- the original bound
    ## them to misleading names (stdin, stdout).
    stdout, stderr = piped.communicate()
    return stdout, stderr
def alignfast3(data, names, seqs):
    """ makes subprocess call to muscle

    Variant without the ">" fasta prefix: joins "name\\nseq" records,
    echoes them into muscle through a shell pipeline, returns stdout.
    """
    records = (label + "\n" + seq for label, seq in zip(names, seqs))
    joined = "\n".join(records)
    ## raw string concatenation into a shell command -- trusted input only
    cmd = "/bin/echo '" + joined + "' | " + data.muscle + " -quiet -in -"
    return subprocess.check_output(cmd, shell=True)
def sortalign(stringnames):
    """ parses muscle output from a string to two lists

    Splits muscle's fasta text on "\\n>" record boundaries, rejoins each
    record's wrapped sequence lines, and returns a list of
    (">name", sequence) tuples sorted by name.
    """
    pairs = []
    for chunk in stringnames.split("\n>"):
        lines = chunk.split("\n")
        ## first line is the header (strip any ">" chars); the rest are
        ## wrapped sequence lines to be concatenated
        header = lines[0].replace(">", "")
        sequence = "".join(lines[1:])
        pairs.append((">" + header, sequence))
    ## stable sort on the name field only (ties keep input order)
    return sorted(pairs, key=lambda pair: pair[0])
import gzip
import subprocess
import shlex
## Read the within-sample cluster file for sample 1B0; clusters are
## separated by "//\n//\n" delimiters. Keep only the first 10 for testing.
infile = gzip.open("test_pairddrad/test_pairddrad_clust_0.85/1B0.clust.gz")
clusts = infile.read().split("//\n//\n")[:10]
## Within a cluster, lines alternate name/sequence. Insert "AA" at
## position 7 of the first sequence to fake an indel for the alignment
## timing tests below. NOTE(review): names/seqs are rebound every
## iteration, so after this loop only the LAST cluster's (modified)
## names/seqs survive -- that is what the later %%timeit cells align.
for clust in clusts:
    lines = clust.split("\n")
    names = lines[::2]
    seqs = lines[1::2]
    seqs[0] = list(seqs[0])
    seqs[0].insert(7, "AA")
    seqs[0] = "".join(seqs[0])
%%timeit
alignfast(data1, names, seqs)
%%timeit
alignfast3(data1, names, seqs)
out = alignfast3(data1, names, seqs)
#sorts = sortalign(out)
print out
out
def sortalign(stringnames):
    """ parses muscle output from a string to two lists

    Like the earlier version but drops the leading ">" before splitting,
    so the first record is handled the same as the rest. Returns a list
    of (">name", sequence) tuples sorted by name.
    """
    ## skip the initial ">" then split on newline-">" record boundaries
    records = stringnames[1:].split("\n>")
    tups = []
    for rec in records:
        fields = rec.split("\n")
        ## header on the first line (defensively strip ">"); wrapped
        ## sequence lines concatenated back together
        label = ">" + fields[0].replace(">", "")
        tups.append((label, "".join(fields[1:])))
    ## stable sort keyed on the label only
    return sorted(tups, key=lambda tup: tup[0])
def parsemuscle(out):
""" parse muscle string output into two sorted lists """
lines = out[1:].split("\n>")
names = [line.split("\n", 1)[0] for line in lines]
seqs = [line.split("\n", 1)[1].replace("\n", "") for line in lines]
tups = zip(names, seqs)
anames, aseqs = zip(*sorted(tups, key=lambda x: int(x[0].split(";")[-1][1:])))