!ls -l

!ls -l SIMsmall/

%%bash
## you must have the egglib Python module installed to do this step. 
python simradsMANU.py 0.00 0 10000 1 SIMsmall/simRADs_no.fastq
python simradsMANU.py 0.02 0 10000 1 SIMsmall/simRADs_low.fastq
python simradsMANU.py 0.05 0 10000 1 SIMsmall/simRADs_high.fastq

!head -n12 SIMsmall/simRADs_no.fastq

%%bash
## create working directories for each PyRAD analysis
mkdir SIMsmall/Py_no
mkdir SIMsmall/Py_low
mkdir SIMsmall/Py_high

%time ! ~/pyrad_v.1.62/pyRAD -p SIMsmall/params_no.txt   2> SIMsmall/log.pyno

%time ! ~/pyrad_v.1.62/pyRAD -p SIMsmall/params_low.txt  2> SIMsmall/log.pylow

%time ! ~/pyrad_v.1.62/pyRAD -p SIMsmall/params_high.txt 2> SIMsmall/log.pyhigh

%%bash
## The following script removes unneeded files leftover from the analysis to save memory space.
rm -r SIMsmall/Py_no/fastq/   SIMsmall/Py_no/edits/ 
rm -r SIMsmall/Py_low/fastq/  SIMsmall/Py_low/edits/  
rm -r SIMsmall/Py_high/fastq/ SIMsmall/Py_high/edits/

%%bash
## create working directories for each stacks analysis
mkdir SIMsmall/stackf_no
mkdir SIMsmall/stackf_low
mkdir SIMsmall/stackf_high

%%bash
## create a barcode file for stacks analyses
cut -f 2 SIMsmall/simRADs_no.fastq.barcodes > SIMsmall/simRADs_no.stacks.barcodes
cut -f 2 SIMsmall/simRADs_low.fastq.barcodes > SIMsmall/simRADs_low.stacks.barcodes
cut -f 2 SIMsmall/simRADs_high.fastq.barcodes > SIMsmall/simRADs_high.stacks.barcodes

%%bash
process_radtags -f SIMsmall/simRADs_no.fastq -o SIMsmall/stackf_no/ \
                       -b SIMsmall/simRADs_no.stacks.barcodes -e pstI -E phred33

import glob   
def makestackslist(dset):
    infiles = ["-s "+ff+" " for ff in glob.glob("SIMsmall/stackf_"+dset+"/*.fq")]
    denovo = "denovo_map.pl -m 2 -M 13 -n 15 -T 12 -b 1 -B 1 -S -X "+\
             "'populations:-m 6' -D 'SIMsmall_no' -o SIMsmall/stackf_"+dset+" "+"".join(infiles)
    with open("SIMsmall/stacks_"+dset+".sh",'w') as outfile:
        outfile.write(denovo)
        

makestackslist('no')

!cat SIMsmall/stacks_no.sh

%time ! sh SIMsmall/stacks_no.sh 2> SIMsmall/log.stackno

%%bash
process_radtags -f SIMsmall/simRADs_low.fastq -o SIMsmall/stackf_low/ \
                       -b SIMsmall/simRADs_low.stacks.barcodes -e pstI -E phred33

makestackslist("low")

!cat SIMsmall/stacks_low.sh

%time ! sh SIMsmall/stacks_low.sh 2> SIMsmall/log.stacklow

%%bash
process_radtags -f SIMsmall/simRADs_high.fastq -o SIMsmall/stackf_high/ \
                       -b SIMsmall/simRADs_high.stacks.barcodes -e pstI -E phred33

makestackslist("high")

!cat SIMsmall/stacks_high.sh

%time ! sh SIMsmall/stacks_high.sh 2> SIMsmall/log.stackhigh

## Stacks results ##
lines = open("SIMsmall/stackf_high/batch_1.haplotypes.tsv").readlines()
cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]]
shigh = [cnts.count(i) for i in range(1,13)]
SHIGH = [int(sum(shigh)-sum(shigh[:i-1])) for i in range(1,13)]

lines = open("SIMsmall/stackf_low/batch_1.haplotypes.tsv").readlines()
cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]]
slow = [cnts.count(i) for i in range(1,13)]
SLOW = [int(sum(slow)-sum(slow[:i-1])) for i in range(1,13)]

lines = open("SIMsmall/stackf_no/batch_1.haplotypes.tsv").readlines()
cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]]
sno = [cnts.count(i) for i in range(1,13)]
SNO = [int(sum(sno)-sum(sno[:i-1])) for i in range(1,13)]

## PyRAD results ##
infile = open("SIMsmall/Py_no/stats/c85d6m2pN.stats").readlines()
pno = [int(infile[j].strip().split("\t")[1]) for j in range(26,38)]
PNO = [sum(pno)-sum(pno[:i-1]) for i in range(1,13)]

infile = open("SIMsmall/Py_low/stats/c85d6m2pN.stats").readlines()
plow = [int(infile[j].strip().split("\t")[1]) for j in range(26,38)]
PLOW = [sum(plow)-sum(plow[:i-1]) for i in range(1,13)]

infile = open("SIMsmall/Py_high/stats/c85d6m2pN.stats").readlines()
phigh = [int(infile[j].strip().split("\t")[1]) for j in range(26,38)]
PHIGH = [sum(phigh)-sum(phigh[:i-1]) for i in range(1,13)]

## convert PyRAD lists to R objects ##
%load_ext rmagic 
%Rpush PNO PLOW PHIGH SNO SLOW SHIGH

%%R
RES = cbind(PNO,PLOW,PHIGH,SNO,SLOW,SHIGH)
print(RES)

%%R
makeplot <- function() {
    par(mfcol=c(1,3),
    cex.axis=1.1,
    cex.main=1.1,
    cex.lab=1.1,
    mar=c(4,4,3,0), 
    oma=c(1,2,0,2))

    plot(RES[,1], ylim=c(0,18000),xlim=c(1,12),
        xlab="Minimum taxon coverage",
        ylab="Number of loci recovered",
        main="No indels")

    points(RES[,1], pch=20, col='black', cex=2.5)
    lines(RES[,1], lwd=3)
    points(RES[,4], pch=20, col='darkgrey', cex=1.35)
    lines(RES[,4], lwd=2.5, col='darkgrey')

    plot(RES[,2], ylim=c(0,18000),xlim=c(1,12),
        xlab="Minimum taxon coverage",
        ylab="",#loci with at least N taxa",
        main="Low indels")

    points(RES[,2], pch=20, col='black', cex=2.5)
    lines(RES[,2], lwd=3)
    points(RES[,5], pch=20, col='darkgrey', cex=1.35)
    lines(RES[,5], lwd=2.5, col='darkgrey')

    plot(RES[,3], ylim=c(0,18000),xlim=c(1,12),
        xlab="Minimum taxon coverage",
        ylab="",#loci with at least N taxa",
        main="High indels")

    points(RES[,3], pch=20, col='black', cex=2.5)
    lines(RES[,3], lwd=3)
    points(RES[,6], pch=20, col='darkgrey', cex=1.35)
    lines(RES[,6], lwd=2.5, col='darkgrey')
}

%%R
pdf("SIMsmall/FIG_1.pdf",
    width=6.5, height=2.8,
    family="Times")
makeplot()
dev.off()

%R makeplot()

import glob
RES = []
for run in ["no", "low", "high"]:
    handle = "SIMsmall/stackf_"+run+"/sample_*.tags.tsv"
    c = [open(ff,'r').read().count("consensus") for ff in glob.glob(handle)]
    RES.append(['stacks',run,float(sum(c))/len(c)])
    
    handle = "SIMsmall/Py_"+run+"/stats/s5.consens.txt"
    c = map(float,[line.split("\t")[3] for line in open(handle,'r').readlines()[1:-7]]) 
    RES.append(['pyrad',run,float(sum(c))/len(c)])

for res in RES:
    print 'mean N loci per sample from %s indel %s run  \t%f' % (res[1],res[0],res[2])


infile = open("SIMsmall/stackf_high/batch_1.haplotypes.tsv").readlines()
tcov = [line.split("\t")[1] for line in infile[1:]]
print mean(map(int,tcov)), 'mean coverage stacks highindel'

infile = open("SIMsmall/stackf_low/batch_1.haplotypes.tsv").readlines()
tcov = [line.split("\t")[1] for line in infile[1:]]
print mean(map(int,tcov)), 'mean coverage stacks lowindel'

infile = open("SIMsmall/stackf_no/batch_1.haplotypes.tsv").readlines()
tcov = [line.split("\t")[1] for line in infile[1:]]
print mean(map(int,tcov)), 'mean coverage stacks noindel'

infile = open("SIMsmall/Py_high/stats/c85d6m2pN.stats").readlines()[26:38]
tcov = []
for line in infile:
    a = line.split("\t")
    tcov += [int(a[0])]*int(a[1]) 
print mean(tcov), 'mean coverage pyrad highindel'

infile = open("SIMsmall/Py_low/stats/c85d6m2pN.stats").readlines()[26:38]
tcov = []
for line in infile:
    a = line.split("\t")
    tcov += [int(a[0])]*int(a[1]) 
print mean(tcov), 'mean coverage pyrad lowhindel'

infile = open("SIMsmall/Py_no/stats/c85d6m2pN.stats").readlines()[26:38]
tcov = []
for line in infile:
    a = line.split("\t")
    tcov += [int(a[0])]*int(a[1]) 
print mean(tcov), 'mean coverage pyrad noindel'


%time ! ~/pyrad_v.1.62/pyRAD -p REALDATA/paramsPAR.txt -s 2

! mkdir REALDATA/edits4stacks

for ff in glob.glob("REALDATA/edits/*.edit"):
    dat = open(ff).readlines()
    outfile = open("REALDATA/edits4stacks/"+ff.split("/")[-1].replace(".edit",".fq"),'w')
    writing = []
    for line in dat:
        if ">" in line:
            x,y = line.split("_")[2:4]
            nn = "@GRC13_0027_FC:4:1:"+x+":"+y+"#0/1"
            writing.append(nn.strip())
        else:
            seq = "TGCAG"+line.strip()
            writing.append(seq)
            writing.append("+")
            writing.append("b"*len(seq)+"\n")
            outfile.write("\n".join(writing))
            writing = []
    outfile.close()
    

%time ! ~/pyrad_v.1.62/pyRAD -p REALDATA/paramsNOPAR.txt -s 34567 2> REALDATA/log.py.txt

%%bash
mv REALDATA/stats REALDATA/stats.nopar
mkdir REALDATA/stats
rm REALDATA/clust.85/*.consens

~/pyrad_v.1.62/pyRAD -p REALDATA/paramsPAR.txt -s 4567 2> REALDATA/log.py2.txt

!mkdir REALDATA/stacks85

import glob
infiles = ["-s "+ff+" " for ff in glob.glob("REALDATA/edits4stacks/*")]
denovo = "denovo_map.pl -m 2 -M 10 -N 10 -n 10 -T 12 -b 1 -B 1 -S -X "+\
         "'populations:-m 6' -X 'ustacks:--max_locus_stacks 2' -D 'stacks85' "+\
         "-o REALDATA/stacks85 "+"".join(infiles)
with open("REALDATA/stacks85/stacks.sh",'w') as outfile:
    outfile.write(denovo)

!cat REALDATA/stacks85/stacks.sh

%time !sh REALDATA/stacks85/stacks.sh 2> REALDATA/log.stacks.txt

infile = open("REALDATA/stats.nopar/c85d6m2pN.stats").readlines()
pno = [int(infile[j].strip().split("\t")[1]) for j in range(27,40)]
PEMPN = [sum(pno)-sum(pno[:i-1]) for i in range(1,14)]

infile = open("REALDATA/stats/c85d6m2p3H3N3.stats").readlines()
fno = [int(infile[j].strip().split("\t")[1]) for j in range(27,40)]
PEMPF = [sum(fno)-sum(fno[:i-1]) for i in range(1,14)]

handle = "REALDATA/stacks85/batch_1.haplotypes.tsv"
lines = open(handle).readlines()
FILTER = []
NOFILTER = []
for line in lines[1:]:
    dat = line.strip().split("\t")[1:]
    cnt = int(dat[0])
    ocnt = int(dat[0])
    hets = []
    for loc in dat:
        " remove loci with more than 2 alleles"
        if loc.count("/") > 1:
            dat.remove(loc)
            cnt -= 1
        else:
            if "/" in loc:
                a,b = loc.split("/")  
                " remove loci with more than 4 hetero sites "
                if len(a) > 3:
                    dat.remove(loc)
                    cnt -= 1
                else:
                    " remove stack if a hetero site is shared across more than 4 samples"
                    h = [i for i in range(len(a)) if a[i]!=b[i]]
                    for i in h:
                        hets.append(i)
    if [hets.count(i) for i in set(hets)]:
        if max([hets.count(i) for i in set(hets)]) > 4:
            cnt = 0
    FILTER.append(cnt)
    NOFILTER.append(ocnt)

filt = [FILTER.count(i) for i in range(1,14)]
nofilt = [NOFILTER.count(i) for i in range(1,14)]
SEMPF = [int(sum(filt)-sum(filt[:i-1])) for i in range(1,14)]
SEMPN = [int(sum(nofilt)-sum(nofilt[:i-1])) for i in range(1,14)]

print "mean coverage stacks no filter", mean(NOFILTER)
print "%fullcov stacks no filter",float([NOFILTER.count(13)][0])/sum([FILTER.count(i) for i in range(1,14)])*100
print ""
print "mean coverage stacks paralog filtered", mean(FILTER)
print "%fullcov stacks filtered",float([FILTER.count(13)][0])/sum([FILTER.count(i) for i in range(1,14)])*100


def meancovpyrad(lines):
    tcov = []
    for line in lines:
        a = line.split("\t")
        tcov += [int(a[0])]*int(a[1])
    full = 100*float(tcov.count(13))/len(tcov)
    return mean(tcov), full


infile = open("REALDATA/stats.nopar/c85d6m2pN.stats").readlines()[27:40]
nofiltpycov, fullnofiltpy = meancovpyrad(infile)
infile = open("REALDATA/stats/c85d6m2p3H3N3.stats").readlines()[27:40]
filtpycov, fullfiltpy = meancovpyrad(infile)
print "mean coverage pyrad no filter", nofiltpycov
print "%fullcov pyrad no filter", fullnofiltpy
print ""
print "mean coverage pyrad paralog filtered", filtpycov
print "%fullcov pyrad no filter", fullfiltpy


%load_ext rmagic 
%Rpush SEMPF SEMPN PEMPF PEMPN

%%R 
DIFFN <- c(SEMPN-PEMPN)
DIFFF <- c(SEMPF-PEMPF)

%%R
REALRES = cbind(PEMPN,SEMPN,PEMPF,SEMPF,DIFFN,DIFFF)
print(REALRES)

%%R
diffplot <- function() {
    par(cex.axis=1.1,
    cex.main=1.1,
    cex.lab=1.6,
    mar=c(4,4,3,0), 
    oma=c(1,2,0,2))
    D <- log(abs(DIFFN)) * c(-1,-1,-1,rep(1,10))
    E <- log(abs(DIFFF)) * c(-1,-1,-1,rep(1,10))
    names(D) <- seq(1,13)
    df.bar <- barplot(D, col=c(rep('grey',3),rep('black',10)),
              ylim=c(-12,10), space=0.5,
              ylab="Log difference in number of loci recovered",
              xlab="Minimum taxon coverage",axes=F)
    box()
    abline(h=0,lwd=2,lty=2)
    points(df.bar, E, bg=c(rep('grey',3),rep('black',10)),
           cex=2, pch=21)
    axis(2,c(-10,-5,0,5,10),c("10","5","0","5","10"))
}    
diffplot()

%%R
pdf("diffplot.pdf", width=7, height=6, family='Times')
diffplot()
dev.off()

def counts(infile, mincov):
    lines = [i.split("\n") for i in infile]
    INDS = []
    for loc in lines[:-1]:
        seqs = []
        for line in loc:
            if ">" in line:
                seqs.append(line.split(" ")[-1])
        if len(seqs) >= mincov:
            N = numpy.array([list(i) for i in seqs])
            cols = iter(N.transpose())
            inds = 0
            col = cols.next()
            while 1:
                if "-" in col:
                    try: col = cols.next()
                    except StopIteration: break
                    if "-" not in col:
                        inds += 1
                else:
                    try: col = cols.next()
                    except StopIteration: break
            INDS.append(inds)
    i = [i for i in INDS if i!=0]
    return float(len(i))/len(INDS)

#infile1 = open("REALDATA/outfiles/c85d6m2pN.loci").read().split("//")
infile1 = open("REALDATA/outfiles/c85d6m2p3H3N3.loci").read().split("//")
infile2 = open("SIMsmall/Py_high/outfiles/c85d6m2pN.loci").read().split("//")
infile3 = open("SIMsmall/Py_low/outfiles/c85d6m2pN.loci").read().split("//")

for i in [13,12,10,8,6,4]:
    dat = counts(infile1,i)
    print 'cyatho',i,"%0.3f" % dat
indelesthigh = counts(infile2,12)
indelestlow = counts(infile3,12)
print 'simhigh', 12, "%0.3f"  %  indelesthigh
print 'simlow', 12, "%0.3f"  %  indelestlow

!python simradsMANU.py 0.0 50 500000 1 SIMbig/big12.fastq

%%bash
mkdir SIMbig/big0/
mkdir SIMbig/big4/

%time ! ~/pyrad_v.1.62/pyRAD -p SIMbig/paramsbig.txt -s 12345 2> SIMbig/log.big

%time ! ~/pyrad_v.1.62/pyRAD -p SIMbig/paramsbig.txt -s67 2> SIMbig/log.big

%%bash
mv SIMbig/big0/clust.85/ SIMbig/big4/
mkdir SIMbig/big4/stats

%time ! ~/pyrad_v.1.62/pyRAD -p SIMbig/paramsbigH4.txt  -s 67 2> SIMbig/log.bigH4

%%bash
mkdir SIMbig/stackf
cut -f 2 SIMbig/big12.fastq.barcodes > SIMbig/big12.stacks.barcodes

! process_radtags -f SIMbig/big12.fastq -o SIMbig/stackf -b SIMbig/big12.stacks.barcodes -e pstI -E phred33

makestackslist("nom")

infiles = ["-s "+ff+" " for ff in glob.glob("SIMbig/stackf/*.fq")]
denovo = "denovo_map.pl -m 2 -M 13 -n 15 -T 12 -b 1 -B 1 -S -X "+\
         "'populations:-m 6' -D 'SimRADsBIG' -o SIMbig/stackf "+"".join(infiles)
with open("SIMbig/stacks.sh",'w') as outfile:
    outfile.write(denovo)

%time ! sh SIMbig/stacks.sh

infile = open("SIMbig/stackf/batch_1.haplotypes.tsv").readlines()
tcov = [line.split("\t")[1] for line in infile[1:]]
print mean(map(int,tcov)), 'mean coverage stacks'

infile = open("SIMbig/big0/stats/big.c85d6m2pN.stats").readlines()[26:38]
tcov = []
for line in infile:
    a = line.split("\t")
    tcov += [int(a[0])]*int(a[1]) 
print mean(tcov), 'mean coverage pyrad'

infile = open("SIMbig/big4/stats/big.c85d6m2pN.stats").readlines()[26:38]
tcov = []
for line in infile:
    a = line.split("\t")
    tcov += [int(a[0])]*int(a[1]) 
print mean(tcov), 'mean coverage pyrad hierarchical'

infile = open("SIMbig/big0/stats/big.c85d6m2pN.stats").readlines()
phigh = [int(infile[j].strip().split("\t")[1]) for j in range(26,38)]
P0 = [sum(phigh)-sum(phigh[:i-1]) for i in range(1,13)]

infile = open("SIMbig/big4/stats/big.c85d6m2pN.stats").readlines()
phigh = [int(infile[j].strip().split("\t")[1]) for j in range(26,38)]
P4 = [sum(phigh)-sum(phigh[:i-1]) for i in range(1,13)]

lines = open("SIMbig/stackf/batch_1.haplotypes.tsv").readlines()
cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]]
sno = [cnts.count(i) for i in range(1,13)]
S0 = [int(sum(sno)-sum(sno[:i-1])) for i in range(1,13)]

print array([P0,P4,S0]).transpose()

100*(253./500000)