#!/usr/bin/env python
# coding: utf-8

# ### ipyrad testing

# In[1]:

import ipyrad as ip        ## for RADseq assembly
print(ip.__version__)      ## print version


# In[2]:

## clear the test directory if it already exists
import shutil
import os

if os.path.exists("./test_pairgbs"):
    shutil.rmtree("./test_pairgbs")


# ### Getting started -- Assembly objects

# The first step is to create an Assembly object. It takes an optional
# argument that gives it an internal name. Imagine, for example, that we
# plan to assemble and later combine data from multiple sequencing runs,
# but that each group of samples must first be analyzed under its own set
# of parameters. We might call one data set "2014_data_set" and another
# "2015_data_set".

# In[3]:

## create an Assembly object named 'test_pairgbs'
data1 = ip.Assembly('test_pairgbs')


# In[5]:

data1.set_params("project_dir", "./test_pairgbs")
data1.set_params("raw_fastq_path", "./data/sim_pairgbs_*.gz")
data1.set_params("barcodes_path", "./data/sim_pairgbs_barcodes.txt")
data1.set_params("datatype", "pairgbs")

#data1.set_params(17, 0)
#data1.set_params(19, 1)


# In[6]:

data1.get_params()


# ### Step 1: Demultiplex the raw data files

# This step demultiplexes the raw data files and links the resulting
# fastq files to the Assembly object as Samples.

# In[8]:

## demultiplex the raw_data files
## set step1 to run only if no samples are present...
data1.step1()  #append=True)
print(data1.stats)


# In[ ]:

#data1.set_params(4, "./test_pairgbs/fastq/*")
#data1.link_fastqs()


# ### Step 2: Filter reads

# If we wanted to execute on just a subsample of our data, we could do so
# by selecting only certain samples to call the `step2` function on.
# Because `step2` is a method of the Assembly object, it always executes
# with the parameters linked to that object.

# In[9]:

data1.step2(force=True)  #["1A0","1B0"])
print(data1.stats)


# We can access the `name` and `fname` of the `Sample` objects and edit
# them as desired without affecting the original data files. The `name`
# identifier is equal to the filename (`fname`) by default, but it is the
# name used in the final output files, so it may be desirable to reduce
# it to something more readable.

# In[1]:

import ipyrad as ip
data1 = ip.load.load_assembly("test_pairgbs/test_pairgbs.assembly")


# In[11]:

data1.step3()


# In[3]:

print(data1.stats)


# In[2]:

data1.step4()
print(data1.stats)


# In[ ]:

data1.step5(["1A0"])


# In[ ]:

#data1.step5(["1A0"]) ## better filters for -N-


# ### Quick parameter explanations are always on-hand

# In[ ]:

ip.get_params_info(10)


# ### Log history

# A common problem after struggling through an analysis is finding that
# you have completely forgotten which parameters you used at which point,
# and when you changed them. The log history time-stamps all calls to
# `set_params()` as well as calls to `step` methods. It also records
# copies/branching of data objects.

# In[ ]:

for i in data1.log:
    print(i)


# ### Saving Assembly objects

# Assembly objects can be saved and loaded so that interactive analyses
# can be started, stopped, and returned to quite easily. The format of
# these saved files is a serialized 'pickle' object used by Python.
# Individual Sample objects are saved within Assembly objects. While some
# of the information in Assembly objects lives in their links to data
# files, most of the useful information that we would want to analyze
# post-assembly is stored in the object itself. Thus these objects will
# be useful for making plots and tables of assembly statistics later.

# In[ ]:

## save assembly object
#ip.save_assembly("data1.p")

## load assembly object
#data = ip.load_assembly("data1.p")
#print(data.name)
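# As a quick sanity check of the save/load round trip, we can reload the
# '.assembly' file with the same loader call used earlier in this notebook
# and compare it against the live object. A minimal sketch -- the file
# path and the auto-saving behavior are assumptions based on the load call
# above, not a documented guarantee:

# In[ ]:

## reload the Assembly from disk under a second name and compare
data2 = ip.load.load_assembly("test_pairgbs/test_pairgbs.assembly")
print(data2.name == data1.name)    ## the internal name travels with the file
print(data2.stats)                 ## as do the per-sample stats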
# ### Figuring out inserts

# In[ ]:

indels = []
catg = []


# In[ ]:

import numpy as np

## a toy catg array: 2 sites of 4 reads, with base counts along the last axis
arcatg = np.array([[[5, 0, 0, 0],
                    [0, 5, 0, 0],
                    [0, 0, 5, 0],
                    [0, 0, 0, 5]],

                   [[5, 0, 0, 0],
                    [0, 5, 0, 0],
                    [0, 0, 5, 0],
                    [0, 0, 0, 5]]], dtype=np.uint32)


# In[ ]:

arcatg
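# To sanity-check a catg array like this one, it helps to collapse it back
# into base calls. A minimal sketch, assuming the last axis holds counts in
# C, A, T, G order (an assumption based on the 'catg' name, not verified
# here):

# In[ ]:

## depth and most-observed base per site, assuming axis order (C, A, T, G)
bases = "CATG"                   ## assumed base order
depth = arcatg.sum(axis=2)       ## read depth at each site
calls = arcatg.argmax(axis=2)    ## index of the deepest base per site
print(depth)
for read in calls:
    print("".join(bases[i] for i in read))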