import ipyrad as ip ## for RADseq assembly
print ip.__version__ ## print version
0.1.26
## clear test directory if it already exists
import shutil
import os
if os.path.exists("./test_pairgbs"):
shutil.rmtree("./test_pairgbs")
The first step is to create an Assembly object. It takes an optional argument that provides it with an internal name. We could imagine that we planned to assemble and later combine data from multiple sequencing runs, but before combining them each group of samples has to be analyzed under a different set of parameters. As an example, we could call this data set "2014_data_set" and another "2015_data_set".
## create an Assembly object called data1.
## It takes an 'test'
data1 = ip.Assembly('test_pairgbs')
New Assembly: test_pairgbs
data1.set_params("project_dir", "./test_pairgbs")
data1.set_params("raw_fastq_path", "./data/sim_pairgbs_*.gz")
data1.set_params("barcodes_path", "./data/sim_pairgbs_barcodes.txt")
data1.set_params("datatype", "pairgbs")
#data1.set_params(17, 0)
#data1.set_params(19, 1)
data1.get_params()
1 project_dir ./test_pairgbs 2 raw_fastq_path ./data/sim_pairgbs_*.gz 3 barcodes_path ./data/sim_pairgbs_barcodes.txt 4 sorted_fastq_path 5 assembly_method denovo 6 reference_sequence 7 datatype pairgbs 8 restriction_overhang ('TGCAG', '') 9 max_low_qual_bases 5 10 phred_Qscore_offset 33 11 mindepth_statistical 6 12 mindepth_majrule 6 13 maxdepth 1000 14 clust_threshold 0.85 15 max_barcode_mismatch 1 16 filter_adapters 0 17 filter_min_trim_len 35 18 max_alleles_consens 2 19 max_Ns_consens (5, 5) 20 max_Hs_consens (8, 8) 21 min_samples_locus 4 22 max_SNPs_locus (100, 100) 23 max_Indels_locus (5, 99) 24 max_shared_Hs_locus 0.25 25 edit_cutsites (0, 0) 26 trim_overhang (1, 2, 2, 1) 27 output_formats * 28 pop_assign_file 29 excludes 30 outgroups
This demultiplexes, and links the new fastq data files as Samples to the Assembly object.
## demultiplex the raw_data files
## set step1 to only go if no samples are present...
data1.step1() #append=True)
print data1.stats
state reads_raw 1A0 1 4000 1B0 1 4000 1C0 1 4000 1D0 1 4000 2E0 1 4000 2F0 1 4000 2G0 1 4000 2H0 1 4000 3I0 1 4000 3J0 1 4000 3K0 1 4000 3L0 1 4000
#data1.set_params(4, "./test_pairgbs/fastq/*")
#data1.link_fastqs()
If for some reason we wanted to execute on just a subsample of our data, we could do this by selecting only certain samples to call the step2
function on. Because step2
is a function of data
, it will always execute with the parameters that are linked to data
.
data1.step2(force=True) #["1A0","1B0"])#
print data1.stats
state reads_raw reads_filtered 1A0 2 4000 4000 1B0 2 4000 4000 1C0 2 4000 4000 1D0 2 4000 4000 2E0 2 4000 4000 2F0 2 4000 4000 2G0 2 4000 4000 2H0 2 4000 4000 3I0 2 4000 4000 3J0 2 4000 4000 3K0 2 4000 4000 3L0 2 4000 4000
We can access the name
and fname
of the Sample
objects and edit them as desired without affecting the original data files. The name
identifier is equal to the filename (fname
) by default, but is the name used in the final output files, and thus it may be desirable to reduce it to something more readable, like below.
import ipyrad as ip
data1 = ip.load.load_assembly("test_pairgbs/test_pairgbs.assembly")
loading Assembly: test_pairgbs [test_pairgbs/test_pairgbs.assembly]
data1.step3()
print data1.stats
state reads_raw reads_filtered reads_merged clusters_total \ 1A0 3 4000 4000 4000 200 1B0 3 4000 4000 4000 200 1C0 3 4000 4000 4000 200 1D0 3 4000 4000 4000 200 2E0 3 4000 4000 4000 200 2F0 3 4000 4000 4000 200 2G0 3 4000 4000 4000 200 2H0 3 4000 4000 4000 200 3I0 3 4000 4000 4000 200 3J0 3 4000 4000 4000 200 3K0 3 4000 4000 4000 200 3L0 3 4000 4000 4000 200 clusters_hidepth 1A0 200 1B0 200 1C0 200 1D0 200 2E0 200 2F0 200 2G0 200 2H0 200 3I0 200 3J0 200 3K0 200 3L0 200
data1.step4()
print data1.stats
[3:apply]: ---------------------------------------------------------------------------ValueError Traceback (most recent call last)<string> in <module>() /home/deren/Documents/ipyrad/ipyrad/assemble/jointestimate.pyc in optim(args) 255 (bfreqs, ustacks, counts), 256 disp=False, --> 257 full_output=False) 258 return [sample.name, hetero, errors] 259 /home/deren/anaconda/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in fmin(func, x0, args, xtol, ftol, maxiter, maxfun, full_output, disp, retall, callback) 375 'return_all': retall} 376 --> 377 res = _minimize_neldermead(func, x0, args, callback=callback, **opts) 378 if full_output: 379 retlist = res['x'], res['fun'], res['nit'], res['nfev'], res['status'] /home/deren/anaconda/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in _minimize_neldermead(func, x0, args, callback, xtol, ftol, maxiter, maxfev, disp, return_all, **unknown_options) 433 if retall: 434 allvecs = [sim[0]] --> 435 fsim[0] = func(x0) 436 nonzdelt = 0.05 437 zdelt = 0.00025 /home/deren/anaconda/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in function_wrapper(*wrapper_args) 283 def function_wrapper(*wrapper_args): 284 ncalls[0] += 1 --> 285 return function(*(wrapper_args + args)) 286 287 return ncalls, function_wrapper /home/deren/Documents/ipyrad/ipyrad/assemble/jointestimate.pyc in get_diploid_lik(pstart, bfreqs, ustacks, counts) 94 else: 95 ## get likelihood for all sites ---> 96 lik1 = (1.-hetero)*likelihood1(errors, bfreqs, ustacks) 97 lik2 = (hetero)*likelihood2(errors, bfreqs, ustacks) 98 liks = lik1+lik2 /home/deren/Documents/ipyrad/ipyrad/assemble/jointestimate.pyc in likelihood1(errors, bfreqs, ustacks) 46 ## make sure base_frequencies are in the right order 47 #print uniqstackl.sum()-uniqstack, uniqstackl.sum(), 0.001 ---> 48 totals = np.array([ustacks.sum(axis=1)]*4).T 49 prob = scipy.stats.binom.pmf(totals-ustacks, totals, errors) 50 return np.sum(bfreqs*prob, axis=1) /home/deren/anaconda/lib/python2.7/site-packages/numpy/core/_methods.pyc in _sum(a, axis, dtype, out, keepdims) 30 31 def _sum(a, axis=None, dtype=None, out=None, keepdims=False): ---> 32 return umr_sum(a, axis, dtype, out, keepdims) 33 34 def _prod(a, axis=None, dtype=None, out=None, keepdims=False): ValueError: 'axis' entry is out of bounds [0:apply]: ---------------------------------------------------------------------------ValueError Traceback (most recent call last)<string> in <module>() /home/deren/Documents/ipyrad/ipyrad/assemble/jointestimate.pyc in optim(args) 255 (bfreqs, ustacks, counts), 256 disp=False, --> 257 full_output=False) 258 return [sample.name, hetero, errors] 259 /home/deren/anaconda/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in fmin(func, x0, args, xtol, ftol, maxiter, maxfun, full_output, disp, retall, callback) 375 'return_all': retall} 376 --> 377 res = _minimize_neldermead(func, x0, args, callback=callback, **opts) 378 if full_output: 379 retlist = res['x'], res['fun'], res['nit'], res['nfev'], res['status'] /home/deren/anaconda/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in _minimize_neldermead(func, x0, args, callback, xtol, ftol, maxiter, maxfev, disp, return_all, **unknown_options) 433 if retall: 434 allvecs = [sim[0]] --> 435 fsim[0] = func(x0) 436 nonzdelt = 0.05 437 zdelt = 0.00025 /home/deren/anaconda/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in function_wrapper(*wrapper_args) 283 def function_wrapper(*wrapper_args): 284 ncalls[0] += 1 --> 285 return function(*(wrapper_args + args)) 286 287 return ncalls, function_wrapper /home/deren/Documents/ipyrad/ipyrad/assemble/jointestimate.pyc in get_diploid_lik(pstart, bfreqs, ustacks, counts) 94 else: 95 ## get likelihood for all sites ---> 96 lik1 = (1.-hetero)*likelihood1(errors, bfreqs, ustacks) 97 lik2 = (hetero)*likelihood2(errors, bfreqs, ustacks) 98 liks = lik1+lik2 /home/deren/Documents/ipyrad/ipyrad/assemble/jointestimate.pyc in likelihood1(errors, bfreqs, ustacks) 46 ## make sure base_frequencies are in the right order 47 #print uniqstackl.sum()-uniqstack, uniqstackl.sum(), 0.001 ---> 48 totals = np.array([ustacks.sum(axis=1)]*4).T 49 prob = scipy.stats.binom.pmf(totals-ustacks, totals, errors) 50 return np.sum(bfreqs*prob, axis=1) /home/deren/anaconda/lib/python2.7/site-packages/numpy/core/_methods.pyc in _sum(a, axis, dtype, out, keepdims) 30 31 def _sum(a, axis=None, dtype=None, out=None, keepdims=False): ---> 32 return umr_sum(a, axis, dtype, out, keepdims) 33 34 def _prod(a, axis=None, dtype=None, out=None, keepdims=False): ValueError: 'axis' entry is out of bounds [1:apply]: ---------------------------------------------------------------------------ValueError Traceback (most recent call last)<string> in <module>() /home/deren/Documents/ipyrad/ipyrad/assemble/jointestimate.pyc in optim(args) 255 (bfreqs, ustacks, counts), 256 disp=False, --> 257 full_output=False) 258 return [sample.name, hetero, errors] 259 /home/deren/anaconda/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in fmin(func, x0, args, xtol, ftol, maxiter, maxfun, full_output, disp, retall, callback) 375 'return_all': retall} 376 --> 377 res = _minimize_neldermead(func, x0, args, callback=callback, **opts) 378 if full_output: 379 retlist = res['x'], res['fun'], res['nit'], res['nfev'], res['status'] /home/deren/anaconda/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in _minimize_neldermead(func, x0, args, callback, xtol, ftol, maxiter, maxfev, disp, return_all, **unknown_options) 433 if retall: 434 allvecs = [sim[0]] --> 435 fsim[0] = func(x0) 436 nonzdelt = 0.05 437 zdelt = 0.00025 /home/deren/anaconda/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in function_wrapper(*wrapper_args) 283 def function_wrapper(*wrapper_args): 284 ncalls[0] += 1 --> 285 return function(*(wrapper_args + args)) 286 287 return ncalls, function_wrapper /home/deren/Documents/ipyrad/ipyrad/assemble/jointestimate.pyc in get_diploid_lik(pstart, bfreqs, ustacks, counts) 94 else: 95 ## get likelihood for all sites ---> 96 lik1 = (1.-hetero)*likelihood1(errors, bfreqs, ustacks) 97 lik2 = (hetero)*likelihood2(errors, bfreqs, ustacks) 98 liks = lik1+lik2 /home/deren/Documents/ipyrad/ipyrad/assemble/jointestimate.pyc in likelihood1(errors, bfreqs, ustacks) 46 ## make sure base_frequencies are in the right order 47 #print uniqstackl.sum()-uniqstack, uniqstackl.sum(), 0.001 ---> 48 totals = np.array([ustacks.sum(axis=1)]*4).T 49 prob = scipy.stats.binom.pmf(totals-ustacks, totals, errors) 50 return np.sum(bfreqs*prob, axis=1) /home/deren/anaconda/lib/python2.7/site-packages/numpy/core/_methods.pyc in _sum(a, axis, dtype, out, keepdims) 30 31 def _sum(a, axis=None, dtype=None, out=None, keepdims=False): ---> 32 return umr_sum(a, axis, dtype, out, keepdims) 33 34 def _prod(a, axis=None, dtype=None, out=None, keepdims=False): ValueError: 'axis' entry is out of bounds [2:apply]: ---------------------------------------------------------------------------ValueError Traceback (most recent call last)<string> in <module>() /home/deren/Documents/ipyrad/ipyrad/assemble/jointestimate.pyc in optim(args) 255 (bfreqs, ustacks, counts), 256 disp=False, --> 257 full_output=False) 258 return [sample.name, hetero, errors] 259 /home/deren/anaconda/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in fmin(func, x0, args, xtol, ftol, maxiter, maxfun, full_output, disp, retall, callback) 375 'return_all': retall} 376 --> 377 res = _minimize_neldermead(func, x0, args, callback=callback, **opts) 378 if full_output: 379 retlist = res['x'], res['fun'], res['nit'], res['nfev'], res['status'] /home/deren/anaconda/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in _minimize_neldermead(func, x0, args, callback, xtol, ftol, maxiter, maxfev, disp, return_all, **unknown_options) 433 if retall: 434 allvecs = [sim[0]] --> 435 fsim[0] = func(x0) 436 nonzdelt = 0.05 437 zdelt = 0.00025 /home/deren/anaconda/lib/python2.7/site-packages/scipy/optimize/optimize.pyc in function_wrapper(*wrapper_args) 283 def function_wrapper(*wrapper_args): 284 ncalls[0] += 1 --> 285 return function(*(wrapper_args + args)) 286 287 return ncalls, function_wrapper /home/deren/Documents/ipyrad/ipyrad/assemble/jointestimate.pyc in get_diploid_lik(pstart, bfreqs, ustacks, counts) 94 else: 95 ## get likelihood for all sites ---> 96 lik1 = (1.-hetero)*likelihood1(errors, bfreqs, ustacks) 97 lik2 = (hetero)*likelihood2(errors, bfreqs, ustacks) 98 liks = lik1+lik2 /home/deren/Documents/ipyrad/ipyrad/assemble/jointestimate.pyc in likelihood1(errors, bfreqs, ustacks) 46 ## make sure base_frequencies are in the right order 47 #print uniqstackl.sum()-uniqstack, uniqstackl.sum(), 0.001 ---> 48 totals = np.array([ustacks.sum(axis=1)]*4).T 49 prob = scipy.stats.binom.pmf(totals-ustacks, totals, errors) 50 return np.sum(bfreqs*prob, axis=1) /home/deren/anaconda/lib/python2.7/site-packages/numpy/core/_methods.pyc in _sum(a, axis, dtype, out, keepdims) 30 31 def _sum(a, axis=None, dtype=None, out=None, keepdims=False): ---> 32 return umr_sum(a, axis, dtype, out, keepdims) 33 34 def _prod(a, axis=None, dtype=None, out=None, keepdims=False): ValueError: 'axis' entry is out of bounds ... 8 more exceptions ...
data1.step5(["1A0"])
#data1.step5(["1A0"]) ## better filters for -N-
ip.get_params_info(10)
A common problem after struggling through an analysis is that you find you've completely forgotten what parameters you used at what point, and when you changed them. The log history time stamps all calls to set_params()
, as well as calls to step
methods. It also records copies/branching of data objects.
for i in data.log:
print i
Assembly objects can be saved and loaded so that interactive analyses can be started, stopped, and returned to quite easily. The format of these saved files is a serialized 'pickle' object used by Python. Individual Sample objects are saved within Assembly objects. While it is important to remember that some of the information in Assembly objects is in their links to data files, most of the useful information that we would want to analyze post assembly is stored in the object itself. Thus these objects will be useful for making plots and tables of assembly statistics later.
## save assembly object
#ip.save_assembly("data1.p")
## load assembly object
#data = ip.load_assembly("data1.p")
#print data.name
indels = []
catg = []
arcatg = np.array([[5, 0, 0, 0],
[0, 5, 0, 0],
[0, 0, 5, 0],
[0, 0, 0, 5]],
[[5, 0, 0, 0],
[0, 5, 0, 0],
[0, 0, 5, 0],
[0, 0, 0, 5]], dtype=uint32)
arcatg