!ls -l !ls -l SIMsmall/ %%bash ## you must have the egglib Python module installed to do this step. python simradsMANU.py 0.00 0 10000 1 SIMsmall/simRADs_no.fastq python simradsMANU.py 0.02 0 10000 1 SIMsmall/simRADs_low.fastq python simradsMANU.py 0.05 0 10000 1 SIMsmall/simRADs_high.fastq !head -n12 SIMsmall/simRADs_no.fastq %%bash ## create working directories for each PyRAD analysis mkdir SIMsmall/Py_no mkdir SIMsmall/Py_low mkdir SIMsmall/Py_high %time ! ~/pyrad_v.1.62/pyRAD -p SIMsmall/params_no.txt 2> SIMsmall/log.pyno %time ! ~/pyrad_v.1.62/pyRAD -p SIMsmall/params_low.txt 2> SIMsmall/log.pylow %time ! ~/pyrad_v.1.62/pyRAD -p SIMsmall/params_high.txt 2> SIMsmall/log.pyhigh %%bash ## The following script removes unneeded files leftover from the analysis to save memory space. rm -r SIMsmall/Py_no/fastq/ SIMsmall/Py_no/edits/ rm -r SIMsmall/Py_low/fastq/ SIMsmall/Py_low/edits/ rm -r SIMsmall/Py_high/fastq/ SIMsmall/Py_high/edits/ %%bash ## create working directories for each stacks analysis mkdir SIMsmall/stackf_no mkdir SIMsmall/stackf_low mkdir SIMsmall/stackf_high %%bash ## create a barcode file for stacks analyses cut -f 2 SIMsmall/simRADs_no.fastq.barcodes > SIMsmall/simRADs_no.stacks.barcodes cut -f 2 SIMsmall/simRADs_low.fastq.barcodes > SIMsmall/simRADs_low.stacks.barcodes cut -f 2 SIMsmall/simRADs_high.fastq.barcodes > SIMsmall/simRADs_high.stacks.barcodes %%bash process_radtags -f SIMsmall/simRADs_no.fastq -o SIMsmall/stackf_no/ \ -b SIMsmall/simRADs_no.stacks.barcodes -e pstI -E phred33 import glob def makestackslist(dset): infiles = ["-s "+ff+" " for ff in glob.glob("SIMsmall/stackf_"+dset+"/*.fq")] denovo = "denovo_map.pl -m 2 -M 13 -n 15 -T 12 -b 1 -B 1 -S -X "+\ "'populations:-m 6' -D 'SIMsmall_no' -o SIMsmall/stackf_"+dset+" "+"".join(infiles) with open("SIMsmall/stacks_"+dset+".sh",'w') as outfile: outfile.write(denovo) makestackslist('no') !cat SIMsmall/stacks_no.sh %time ! sh SIMsmall/stacks_no.sh 2> SIMsmall/log.stackno %%bash process_radtags -f SIMsmall/simRADs_low.fastq -o SIMsmall/stackf_low/ \ -b SIMsmall/simRADs_low.stacks.barcodes -e pstI -E phred33 makestackslist("low") !cat SIMsmall/stacks_low.sh %time ! sh SIMsmall/stacks_low.sh 2> SIMsmall/log.stacklow %%bash process_radtags -f SIMsmall/simRADs_high.fastq -o SIMsmall/stackf_high/ \ -b SIMsmall/simRADs_high.stacks.barcodes -e pstI -E phred33 makestackslist("high") !cat SIMsmall/stacks_high.sh %time ! sh SIMsmall/stacks_high.sh 2> SIMsmall/log.stackhigh ## Stacks results ## lines = open("SIMsmall/stackf_high/batch_1.haplotypes.tsv").readlines() cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]] shigh = [cnts.count(i) for i in range(1,13)] SHIGH = [int(sum(shigh)-sum(shigh[:i-1])) for i in range(1,13)] lines = open("SIMsmall/stackf_low/batch_1.haplotypes.tsv").readlines() cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]] slow = [cnts.count(i) for i in range(1,13)] SLOW = [int(sum(slow)-sum(slow[:i-1])) for i in range(1,13)] lines = open("SIMsmall/stackf_no/batch_1.haplotypes.tsv").readlines() cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]] sno = [cnts.count(i) for i in range(1,13)] SNO = [int(sum(sno)-sum(sno[:i-1])) for i in range(1,13)] ## PyRAD results ## infile = open("SIMsmall/Py_no/stats/c85d6m2pN.stats").readlines() pno = [int(infile[j].strip().split("\t")[1]) for j in range(26,38)] PNO = [sum(pno)-sum(pno[:i-1]) for i in range(1,13)] infile = open("SIMsmall/Py_low/stats/c85d6m2pN.stats").readlines() plow = [int(infile[j].strip().split("\t")[1]) for j in range(26,38)] PLOW = [sum(plow)-sum(plow[:i-1]) for i in range(1,13)] infile = open("SIMsmall/Py_high/stats/c85d6m2pN.stats").readlines() phigh = [int(infile[j].strip().split("\t")[1]) for j in range(26,38)] PHIGH = [sum(phigh)-sum(phigh[:i-1]) for i in range(1,13)] ## convert PyRAD lists to R objects ## %load_ext rmagic %Rpush PNO PLOW PHIGH SNO SLOW SHIGH %%R RES = cbind(PNO,PLOW,PHIGH,SNO,SLOW,SHIGH) print(RES) %%R makeplot <- function() { par(mfcol=c(1,3), cex.axis=1.1, cex.main=1.1, cex.lab=1.1, mar=c(4,4,3,0), oma=c(1,2,0,2)) plot(RES[,1], ylim=c(0,18000),xlim=c(1,12), xlab="Minimum taxon coverage", ylab="Number of loci recovered", main="No indels") points(RES[,1], pch=20, col='black', cex=2.5) lines(RES[,1], lwd=3) points(RES[,4], pch=20, col='darkgrey', cex=1.35) lines(RES[,4], lwd=2.5, col='darkgrey') plot(RES[,2], ylim=c(0,18000),xlim=c(1,12), xlab="Minimum taxon coverage", ylab="",#loci with at least N taxa", main="Low indels") points(RES[,2], pch=20, col='black', cex=2.5) lines(RES[,2], lwd=3) points(RES[,5], pch=20, col='darkgrey', cex=1.35) lines(RES[,5], lwd=2.5, col='darkgrey') plot(RES[,3], ylim=c(0,18000),xlim=c(1,12), xlab="Minimum taxon coverage", ylab="",#loci with at least N taxa", main="High indels") points(RES[,3], pch=20, col='black', cex=2.5) lines(RES[,3], lwd=3) points(RES[,6], pch=20, col='darkgrey', cex=1.35) lines(RES[,6], lwd=2.5, col='darkgrey') } %%R pdf("SIMsmall/FIG_1.pdf", width=6.5, height=2.8, family="Times") makeplot() dev.off() %R makeplot() import glob RES = [] for run in ["no", "low", "high"]: handle = "SIMsmall/stackf_"+run+"/sample_*.tags.tsv" c = [open(ff,'r').read().count("consensus") for ff in glob.glob(handle)] RES.append(['stacks',run,float(sum(c))/len(c)]) handle = "SIMsmall/Py_"+run+"/stats/s5.consens.txt" c = map(float,[line.split("\t")[3] for line in open(handle,'r').readlines()[1:-7]]) RES.append(['pyrad',run,float(sum(c))/len(c)]) for res in RES: print 'mean N loci per sample from %s indel %s run \t%f' % (res[1],res[0],res[2]) infile = open("SIMsmall/stackf_high/batch_1.haplotypes.tsv").readlines() tcov = [line.split("\t")[1] for line in infile[1:]] print mean(map(int,tcov)), 'mean coverage stacks highindel' infile = open("SIMsmall/stackf_low/batch_1.haplotypes.tsv").readlines() tcov = [line.split("\t")[1] for line in infile[1:]] print mean(map(int,tcov)), 'mean coverage stacks lowindel' infile = open("SIMsmall/stackf_no/batch_1.haplotypes.tsv").readlines() tcov = [line.split("\t")[1] for line in infile[1:]] print mean(map(int,tcov)), 'mean coverage stacks noindel' infile = open("SIMsmall/Py_high/stats/c85d6m2pN.stats").readlines()[26:38] tcov = [] for line in infile: a = line.split("\t") tcov += [int(a[0])]*int(a[1]) print mean(tcov), 'mean coverage pyrad highindel' infile = open("SIMsmall/Py_low/stats/c85d6m2pN.stats").readlines()[26:38] tcov = [] for line in infile: a = line.split("\t") tcov += [int(a[0])]*int(a[1]) print mean(tcov), 'mean coverage pyrad lowhindel' infile = open("SIMsmall/Py_no/stats/c85d6m2pN.stats").readlines()[26:38] tcov = [] for line in infile: a = line.split("\t") tcov += [int(a[0])]*int(a[1]) print mean(tcov), 'mean coverage pyrad noindel' %time ! ~/pyrad_v.1.62/pyRAD -p REALDATA/paramsPAR.txt -s 2 ! mkdir REALDATA/edits4stacks for ff in glob.glob("REALDATA/edits/*.edit"): dat = open(ff).readlines() outfile = open("REALDATA/edits4stacks/"+ff.split("/")[-1].replace(".edit",".fq"),'w') writing = [] for line in dat: if ">" in line: x,y = line.split("_")[2:4] nn = "@GRC13_0027_FC:4:1:"+x+":"+y+"#0/1" writing.append(nn.strip()) else: seq = "TGCAG"+line.strip() writing.append(seq) writing.append("+") writing.append("b"*len(seq)+"\n") outfile.write("\n".join(writing)) writing = [] outfile.close() %time ! ~/pyrad_v.1.62/pyRAD -p REALDATA/paramsNOPAR.txt -s 34567 2> REALDATA/log.py.txt %%bash mv REALDATA/stats REALDATA/stats.nopar mkdir REALDATA/stats rm REALDATA/clust.85/*.consens ~/pyrad_v.1.62/pyRAD -p REALDATA/paramsPAR.txt -s 4567 2> REALDATA/log.py2.txt !mkdir REALDATA/stacks85 import glob infiles = ["-s "+ff+" " for ff in glob.glob("REALDATA/edits4stacks/*")] denovo = "denovo_map.pl -m 2 -M 10 -N 10 -n 10 -T 12 -b 1 -B 1 -S -X "+\ "'populations:-m 6' -X 'ustacks:--max_locus_stacks 2' -D 'stacks85' "+\ "-o REALDATA/stacks85 "+"".join(infiles) with open("REALDATA/stacks85/stacks.sh",'w') as outfile: outfile.write(denovo) !cat REALDATA/stacks85/stacks.sh %time !sh REALDATA/stacks85/stacks.sh 2> REALDATA/log.stacks.txt infile = open("REALDATA/stats.nopar/c85d6m2pN.stats").readlines() pno = [int(infile[j].strip().split("\t")[1]) for j in range(27,40)] PEMPN = [sum(pno)-sum(pno[:i-1]) for i in range(1,14)] infile = open("REALDATA/stats/c85d6m2p3H3N3.stats").readlines() fno = [int(infile[j].strip().split("\t")[1]) for j in range(27,40)] PEMPF = [sum(fno)-sum(fno[:i-1]) for i in range(1,14)] handle = "REALDATA/stacks85/batch_1.haplotypes.tsv" lines = open(handle).readlines() FILTER = [] NOFILTER = [] for line in lines[1:]: dat = line.strip().split("\t")[1:] cnt = int(dat[0]) ocnt = int(dat[0]) hets = [] for loc in dat: " remove loci with more than 2 alleles" if loc.count("/") > 1: dat.remove(loc) cnt -= 1 else: if "/" in loc: a,b = loc.split("/") " remove loci with more than 4 hetero sites " if len(a) > 3: dat.remove(loc) cnt -= 1 else: " remove stack if a hetero site is shared across more than 4 samples" h = [i for i in range(len(a)) if a[i]!=b[i]] for i in h: hets.append(i) if [hets.count(i) for i in set(hets)]: if max([hets.count(i) for i in set(hets)]) > 4: cnt = 0 FILTER.append(cnt) NOFILTER.append(ocnt) filt = [FILTER.count(i) for i in range(1,14)] nofilt = [NOFILTER.count(i) for i in range(1,14)] SEMPF = [int(sum(filt)-sum(filt[:i-1])) for i in range(1,14)] SEMPN = [int(sum(nofilt)-sum(nofilt[:i-1])) for i in range(1,14)] print "mean coverage stacks no filter", mean(NOFILTER) print "%fullcov stacks no filter",float([NOFILTER.count(13)][0])/sum([FILTER.count(i) for i in range(1,14)])*100 print "" print "mean coverage stacks paralog filtered", mean(FILTER) print "%fullcov stacks filtered",float([FILTER.count(13)][0])/sum([FILTER.count(i) for i in range(1,14)])*100 def meancovpyrad(lines): tcov = [] for line in lines: a = line.split("\t") tcov += [int(a[0])]*int(a[1]) full = 100*float(tcov.count(13))/len(tcov) return mean(tcov), full infile = open("REALDATA/stats.nopar/c85d6m2pN.stats").readlines()[27:40] nofiltpycov, fullnofiltpy = meancovpyrad(infile) infile = open("REALDATA/stats/c85d6m2p3H3N3.stats").readlines()[27:40] filtpycov, fullfiltpy = meancovpyrad(infile) print "mean coverage pyrad no filter", nofiltpycov print "%fullcov pyrad no filter", fullnofiltpy print "" print "mean coverage pyrad paralog filtered", filtpycov print "%fullcov pyrad no filter", fullfiltpy %load_ext rmagic %Rpush SEMPF SEMPN PEMPF PEMPN %%R DIFFN <- c(SEMPN-PEMPN) DIFFF <- c(SEMPF-PEMPF) %%R REALRES = cbind(PEMPN,SEMPN,PEMPF,SEMPF,DIFFN,DIFFF) print(REALRES) %%R diffplot <- function() { par(cex.axis=1.1, cex.main=1.1, cex.lab=1.6, mar=c(4,4,3,0), oma=c(1,2,0,2)) D <- log(abs(DIFFN)) * c(-1,-1,-1,rep(1,10)) E <- log(abs(DIFFF)) * c(-1,-1,-1,rep(1,10)) names(D) <- seq(1,13) df.bar <- barplot(D, col=c(rep('grey',3),rep('black',10)), ylim=c(-12,10), space=0.5, ylab="Log difference in number of loci recovered", xlab="Minimum taxon coverage",axes=F) box() abline(h=0,lwd=2,lty=2) points(df.bar, E, bg=c(rep('grey',3),rep('black',10)), cex=2, pch=21) axis(2,c(-10,-5,0,5,10),c("10","5","0","5","10")) } diffplot() %%R pdf("diffplot.pdf", width=7, height=6, family='Times') diffplot() dev.off() def counts(infile, mincov): lines = [i.split("\n") for i in infile] INDS = [] for loc in lines[:-1]: seqs = [] for line in loc: if ">" in line: seqs.append(line.split(" ")[-1]) if len(seqs) >= mincov: N = numpy.array([list(i) for i in seqs]) cols = iter(N.transpose()) inds = 0 col = cols.next() while 1: if "-" in col: try: col = cols.next() except StopIteration: break if "-" not in col: inds += 1 else: try: col = cols.next() except StopIteration: break INDS.append(inds) i = [i for i in INDS if i!=0] return float(len(i))/len(INDS) #infile1 = open("REALDATA/outfiles/c85d6m2pN.loci").read().split("//") infile1 = open("REALDATA/outfiles/c85d6m2p3H3N3.loci").read().split("//") infile2 = open("SIMsmall/Py_high/outfiles/c85d6m2pN.loci").read().split("//") infile3 = open("SIMsmall/Py_low/outfiles/c85d6m2pN.loci").read().split("//") for i in [13,12,10,8,6,4]: dat = counts(infile1,i) print 'cyatho',i,"%0.3f" % dat indelesthigh = counts(infile2,12) indelestlow = counts(infile3,12) print 'simhigh', 12, "%0.3f" % indelesthigh print 'simlow', 12, "%0.3f" % indelestlow !python simradsMANU.py 0.0 50 500000 1 SIMbig/big12.fastq %%bash mkdir SIMbig/big0/ mkdir SIMbig/big4/ %time ! ~/pyrad_v.1.62/pyRAD -p SIMbig/paramsbig.txt -s 12345 2> SIMbig/log.big %time ! ~/pyrad_v.1.62/pyRAD -p SIMbig/paramsbig.txt -s67 2> SIMbig/log.big %%bash mv SIMbig/big0/clust.85/ SIMbig/big4/ mkdir SIMbig/big4/stats %time ! ~/pyrad_v.1.62/pyRAD -p SIMbig/paramsbigH4.txt -s 67 2> SIMbig/log.bigH4 %%bash mkdir SIMbig/stackf cut -f 2 SIMbig/big12.fastq.barcodes > SIMbig/big12.stacks.barcodes ! process_radtags -f SIMbig/big12.fastq -o SIMbig/stackf -b SIMbig/big12.stacks.barcodes -e pstI -E phred33 makestackslist("nom") infiles = ["-s "+ff+" " for ff in glob.glob("SIMbig/stackf/*.fq")] denovo = "denovo_map.pl -m 2 -M 13 -n 15 -T 12 -b 1 -B 1 -S -X "+\ "'populations:-m 6' -D 'SimRADsBIG' -o SIMbig/stackf "+"".join(infiles) with open("SIMbig/stacks.sh",'w') as outfile: outfile.write(denovo) %time ! sh SIMbig/stacks.sh infile = open("SIMbig/stackf/batch_1.haplotypes.tsv").readlines() tcov = [line.split("\t")[1] for line in infile[1:]] print mean(map(int,tcov)), 'mean coverage stacks' infile = open("SIMbig/big0/stats/big.c85d6m2pN.stats").readlines()[26:38] tcov = [] for line in infile: a = line.split("\t") tcov += [int(a[0])]*int(a[1]) print mean(tcov), 'mean coverage pyrad' infile = open("SIMbig/big4/stats/big.c85d6m2pN.stats").readlines()[26:38] tcov = [] for line in infile: a = line.split("\t") tcov += [int(a[0])]*int(a[1]) print mean(tcov), 'mean coverage pyrad hierarchical' infile = open("SIMbig/big0/stats/big.c85d6m2pN.stats").readlines() phigh = [int(infile[j].strip().split("\t")[1]) for j in range(26,38)] P0 = [sum(phigh)-sum(phigh[:i-1]) for i in range(1,13)] infile = open("SIMbig/big4/stats/big.c85d6m2pN.stats").readlines() phigh = [int(infile[j].strip().split("\t")[1]) for j in range(26,38)] P4 = [sum(phigh)-sum(phigh[:i-1]) for i in range(1,13)] lines = open("SIMbig/stackf/batch_1.haplotypes.tsv").readlines() cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]] sno = [cnts.count(i) for i in range(1,13)] S0 = [int(sum(sno)-sum(sno[:i-1])) for i in range(1,13)] print array([P0,P4,S0]).transpose() 100*(253./500000)