#!/usr/bin/env python # coding: utf-8 # ### ipyrad testing for pairddrad data # In[1]: import ipyrad as ip ## for RADseq assembly print ip.__version__ ## print version # In[2]: ## clear existing test_dir/ import shutil import os if os.path.exists("./test_pairddrad/"): shutil.rmtree("./test_pairddrad/") # In[7]: data1 = ip.Assembly('data1') # In[8]: data1.get_params() # In[11]: data1.set_params('project_dir', "./test_pairddrad") data1.set_params('raw_fastq_path', "./data/sim_pairddrad_*.gz") data1.set_params('barcodes_path', "./data/sim_pairddrad_barcodes.txt") data1.set_params('restriction_overhang', ("TGCAG", "AATT")) data1.set_params('datatype', 'pairddrad') #data1.set_params(17, 1) data1.get_params() # In[ ]: #data1.link_fastqs(path="test_pairddrad/test_pairddrad_fastqs/", append=True) # In[12]: data1.step1() print data1.stats # In[13]: data1.step2()#["1B0", "2H0", "3J0", "3K0"], force=True) print data1.stats # In[14]: # data1.step3() ## do all samples # data1.step3("1A0") ## do one sample # data1.step3(["1A0", "1B0", "1C0"]) ## do list of samples data1.step3()#["1B0", "2H0", "3J0", "3K0"], force=True) print data1.stats # In[16]: get_ipython().run_cell_magic('time', '', 'data1.step4()#["1B0", "2H0", "3J0", "3K0"], force=True) \nprint data1.stats\n') # In[1]: #data1.save("upto2") ## still figuring out how best to save... import ipyrad as ip data1 = ip.load_assembly("test_pairddrad/test_pairddrad.assembly") # In[17]: data1.step5()#["1B0"], force=True) print data1.stats # In[ ]: data1.step5(["1B0", "2H0", "3J0", "3K0"], force=True) print data1.stats # In[18]: data1.step6()#(["1B0", "2H0", "3J0", "3K0"], force=True) print data1.stats # In[19]: data1.step7() # In[21]: data1.outfiles # In[28]: less $data1.outfiles.loci # In[ ]: import pandas as pd print pd.read_table('test_pairddrad/test_pairddrad_consens/s5_consens.txt', delim_whitespace=1, header=0) # In[ ]: ## how to merge Assembly objects data1.files.edits['1B0'] data1.samples['1B0'].files # In[ ]: print data1.stats # In[ ]: for i in data1.log: print i # In[ ]: import numpy as np # In[ ]: adds = np.ones([10, 4], dtype='int16') adds.shape # In[ ]: np.array.([30, 1], dtype='int16') # In[ ]: longest_reads = 190 nreads = int(1e5) arr = np.empty([nreads,200, 4], dtype='int16') arr[0][0:adds.shape[0]] = adds arr[0] # In[ ]: import glob import os import numpy # In[ ]: catg = numpy.load("test_pairddrad/test_pairddrad_consens/1B0.catg") # In[ ]: catg[0] # In[ ]: cats1 = glob.glob(os.path.join( data1.dirs.consens, data1.samples['1B0'].name+"_tmpcats.*")) # In[ ]: cats1.sort(key=lambda x: int(x.split(".")[-1])) # In[ ]: catg = numpy.load(cats1[0]) lastg = numpy.load(cats1[-1]) # In[ ]: catg[0] # In[ ]: lastg.shape # ### Making alignment faster # In[ ]: def alignfast(data, names, seqs): """ makes subprocess call to muscle """ inputstring = "\n".join(">"+i+"\n"+j for i, j in zip(names, seqs)) cmd = "/bin/echo '"+inputstring+"' | "+data.muscle+" -quiet -in -" piped = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, close_fds=True) _, fout = piped.stdin, piped.stdout return fout.read() # In[ ]: def alignfast(data, names, seqs): """ makes subprocess call to muscle """ inputstring = "\n".join(">"+i+"\n"+j for i, j in zip(names, seqs)) cmd = "/bin/echo '"+inputstring+"' | "+data.muscle+" -quiet -in -" piped = subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE) return piped.stdout.read() # In[ ]: get_ipython().run_cell_magic('timeit', '', 'out = alignfast(data1, names, seqs)\n') # In[ ]: print out # In[ ]: def newmuscle(data, names, seqs): inputstring = "\n".join(">"+i+"\n"+j for i, j in zip(names, seqs)) return subprocess.Popen(data.muscle, stdin=subprocess.PIPE, stdout=subprocess.PIPE)\ .communicate(inputstring)[0] # In[ ]: get_ipython().run_cell_magic('timeit', '', 'newmuscle(data1, names, seqs)\n') # In[ ]: def alignfast(data, names, seqs): """ makes subprocess call to muscle """ inputstring = "\n".join(">"+i+"\n"+j for i, j in zip(names, seqs)) cmd = "/bin/echo '"+inputstring+"' | "+data.muscle+" -quiet -in -" piped = subprocess.check_output(cmd, shell=True) return piped # In[ ]: out = alignfast(data1, names, seqs) print out # In[ ]: get_ipython().run_cell_magic('timeit', '', 'out = alignfast(data1, names, seqs)\n') # In[ ]: inputstring = "\n".join(i+"\n"+j for i, j in zip(names, seqs)) print inputstring # In[ ]: pipe = subprocess.Popen(shlex.split(cmd), stdin=subprocess.PIPE, stderr=subprocess.PIPE, stdout=subprocess.PIPE) pipe.stdin.write("muscle -h") get = pipe.communicate(input=pipe) get # In[ ]: def alignfast2(data, names, seqs): """ makes subprocess call to muscle """ inputstring = "\n".join(i+"\n"+j for i, j in zip(names, seqs)) cmd = "/bin/echo '"+inputstring+"' | "+data.muscle+" -quiet -in -" piped = subprocess.Popen(shlex.split(cmd), stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdin, stdout = piped.communicate(input=cmd) return stdin, stdout # In[ ]: def alignfast3(data, names, seqs): """ makes subprocess call to muscle """ inputstring = "\n".join(i+"\n"+j for i, j in zip(names, seqs)) cmd = "/bin/echo '"+inputstring+"' | "+data.muscle+" -quiet -in -" piped = subprocess.check_output(cmd, shell=True) return piped # In[ ]: def sortalign(stringnames): """ parses muscle output from a string to two lists """ objs = stringnames.split("\n>") seqs = [i.split("\n")[0].replace(">", "")+"\n"+\ "".join(i.split('\n')[1:]) for i in objs] aligned = [i.split("\n") for i in seqs] newnames = [">"+i[0] for i in aligned] seqs = [i[1] for i in aligned] ## return in sorted order by names sortedtups = [(i, j) for i, j in zip(*sorted(zip(newnames, seqs), key=lambda pair: pair[0]))] return sortedtups # #### get clusters # In[ ]: import gzip import subprocess import shlex infile = gzip.open("test_pairddrad/test_pairddrad_clust_0.85/1B0.clust.gz") clusts = infile.read().split("//\n//\n")[:10] # In[ ]: for clust in clusts: lines = clust.split("\n") names = lines[::2] seqs = lines[1::2] seqs[0] = list(seqs[0]) seqs[0].insert(7, "AA") seqs[0] = "".join(seqs[0]) # In[ ]: get_ipython().run_cell_magic('timeit', '', 'alignfast(data1, names, seqs)\n') # In[ ]: get_ipython().run_cell_magic('timeit', '', 'alignfast3(data1, names, seqs)\n') # In[ ]: out = alignfast3(data1, names, seqs) #sorts = sortalign(out) print out # In[ ]: out # In[ ]: def sortalign(stringnames): """ parses muscle output from a string to two lists """ objs = stringnames[1:].split("\n>") seqs = [i.split("\n")[0].replace(">", "")+"\n"+\ "".join(i.split('\n')[1:]) for i in objs] aligned = [i.split("\n") for i in seqs] newnames = [">"+i[0] for i in aligned] seqs = [i[1] for i in aligned] ## return in sorted order by names sortedtups = [(i, j) for i, j in zip(*sorted(zip(newnames, seqs), key=lambda pair: pair[0]))] return sortedtups # In[ ]: def parsemuscle(out): """ parse muscle string output into two sorted lists """ lines = out[1:].split("\n>") names = [line.split("\n", 1)[0] for line in lines] seqs = [line.split("\n", 1)[1].replace("\n", "") for line in lines] tups = zip(names, seqs) anames, aseqs = zip(*sorted(tups, key=lambda x: int(x[0].split(";")[-1][1:]))) # In[ ]: