#!/usr/bin/env python
# coding: utf-8

# # Results for ipyrad vs stacks vs dDocent simulated and empirical reference sequence assembly

# I'm assuming here that you've already run the de novo assemblies and have already installed all the
# prereqs for analysing the results. If not, run the ipyrad-manuscript-results.ipynb notebook first.

# In[1]:

## Imports and working/output directories

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib
plt.rcParams["figure.figsize"] = [12, 9]

from collections import Counter
import seaborn as sns
sns.set_style('white')
sns.set_style('ticks')

import toyplot
import toyplot.html    ## toyplot sublib for saving html plots
import pandas as pd
import numpy as np
import subprocess
import collections
import allel
import vcfnp
import shutil
import gzip
import glob
import os

from allel.util import *    ## for ensure_square()

WORK_DIR = "/home/iovercast/manuscript-analysis/"

## Simulation dirs
REFMAP_SIM_DIR = os.path.join(WORK_DIR, "REFMAP_SIM/")
IPYRAD_SIM_DIR = os.path.join(REFMAP_SIM_DIR, "ipyrad/reference-assembly/")
STACKS_SIM_DIR = os.path.join(REFMAP_SIM_DIR, "stacks/")
DDOCENT_SIM_DIR = os.path.join(REFMAP_SIM_DIR, "ddocent/")

## A list of the simulated assembler names for indexing dicts
assemblers = ["ipyrad-reference", "ipyrad-denovo_reference", "stacks", "ddocent"]

## Empirical dirs
REFMAP_EMPIRICAL_DIR = os.path.join(WORK_DIR, "Phocoena_empirical/")
IPYRAD_REFMAP_DIR = os.path.join(REFMAP_EMPIRICAL_DIR, "ipyrad/")
STACKS_REFMAP_DIR = os.path.join(REFMAP_EMPIRICAL_DIR, "stacks/")
DDOCENT_REFMAP_DIR = os.path.join(REFMAP_EMPIRICAL_DIR, "ddocent/")

# # Some helpful functions

# ## Function for plotting PCA given an input vcf file

# In[334]:

## LD pruning code from the scikit-allel cookbook
def plot_ld(gn, title):
    m = allel.stats.rogers_huff_r(gn) ** 2
    ax = allel.plot.pairwise_ld(m)
    ax.set_title(title)

def ld_prune(gn, size=1000, step=1000, threshold=.3, n_iter=5):
    for i in range(n_iter):
        loc_unlinked = allel.stats.ld.locate_unlinked(gn, size=size, step=step, threshold=threshold)
        n = np.count_nonzero(loc_unlinked)
        n_remove = gn.shape[0] - n
        print('iteration', i+1, 'retaining', n, 'removing', n_remove, 'variants')
        gn = gn.compress(loc_unlinked, axis=0)
    return gn

def plotPCA(call_data, title):
    c = call_data
    g = allel.GenotypeArray(c.genotype)
    ac = g.count_alleles()
    ## Filter singletons and multi-allelic snps
    flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
    gf = g.compress(flt, axis=0)
    gn = gf.to_n_alt()
    coords1, model1 = allel.stats.pca(gn, n_components=10, scaler='patterson')
    fig = plt.figure(figsize=(5, 5))
    ax = fig.add_subplot(1, 1, 1)
    sns.despine(ax=ax, offset=5)
    x = coords1[:, 0]
    y = coords1[:, 1]
    ## We know this works because the species_dict and the columns in the vcf
    ## are in the same order.
    for sp in species:
        flt = (np.array(species_dict.values()) == sp)
        ax.plot(x[flt], y[flt], marker='o', linestyle=' ',
                color=species_colors[sp], label=sp, markersize=10, mec='k', mew=.5)
    ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
    ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))
    ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    fig.suptitle(title+" pca", y=1.02, style="italic", fontsize=20, fontweight='bold')
    fig.tight_layout()

def getPCA(call_data):
    c = call_data
    g = allel.GenotypeArray(c.genotype)
    ac = g.count_alleles()
    ## Filter singletons and multi-allelic snps
    flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
    gf = g.compress(flt, axis=0)
    gn = gf.to_n_alt()
    ## Tested w/ ld pruning. Doesn't appreciably change the results.
    #gnu = ld_prune(gn)
    #gn = gnu
    coords1, model1 = allel.stats.pca(gn, n_components=10, scaler='patterson')
    return coords1, model1

## We don't actually use this function because the allel.plot.pairwise_distance()
## call returns a raster which looks nasty in print.
def plotPairwiseDistance(call_data, title):
    c = call_data
    #c = vcfnp.calldata_2d(filename).view(np.recarray)
    g = allel.GenotypeArray(c.genotype)
    gn = g.to_n_alt()
    dist = allel.stats.pairwise_distance(gn, metric='euclidean')
    allel.plot.pairwise_distance(dist, labels=species_dict.keys())

def getDistances(call_data):
    c = call_data
    #c = vcfnp.calldata_2d(filename).view(np.recarray)
    g = allel.GenotypeArray(c.genotype)
    gn = g.to_n_alt()
    dist = allel.stats.pairwise_distance(gn, metric='euclidean')
    return dist
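# Quick illustration of what the biallelic/non-singleton filter inside getPCA() keeps and drops,
# on a few made-up genotypes (a toy check, not part of the analysis):

# In[ ]:

## Three snps x three diploid samples
toy_g = allel.GenotypeArray([[[0, 0], [0, 1], [1, 1]],    ## biallelic, alt allele seen 3x -> kept
                             [[0, 0], [0, 0], [0, 1]],    ## singleton alt -> dropped
                             [[0, 0], [1, 1], [2, 2]]])   ## multi-allelic -> dropped
toy_ac = toy_g.count_alleles()
toy_flt = (toy_ac.max_allele() == 1) & (toy_ac[:, :2].min(axis=1) > 1)
print(toy_flt)                                            ## [ True False False]
print(toy_g.compress(toy_flt, axis=0).to_n_alt())         ## [[0 1 2]]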
# ## Function for plotting distribution of variable sites across loci

# In[3]:

## Inputs to this function are two Counters where keys
## are the base position and values are snp counts.
## The counter doesn't have to be sorted because we sort internally.
def SNP_position_plot(prog, distvar, distpis):
    ## The last position to consider
    maxend = np.array(distvar.keys()).max()

    ## This does two things, first it sorts in increasing
    ## order. Second, it creates a count bin for any position
    ## without snps and sets the count to 0.
    distvar = [distvar[x] for x in xrange(maxend)]
    distpis = [distpis[x] for x in xrange(maxend)]

    ## set color theme
    colormap = toyplot.color.Palette()

    ## make a canvas
    canvas = toyplot.Canvas(width=800, height=300)

    ## make axes
    axes = canvas.cartesian(xlabel="Position along RAD loci",
                            ylabel="N variable sites",
                            gutter=65)
    axes.label.text = prog

    ## x-axis
    axes.x.ticks.show = True
    axes.x.label.style = {"baseline-shift":"-40px", "font-size":"16px"}
    axes.x.ticks.labels.style = {"baseline-shift":"-2.5px", "font-size":"12px"}
    axes.x.ticks.below = 5
    axes.x.ticks.above = 0
    axes.x.domain.max = maxend
    axes.x.ticks.locator = toyplot.locator.Explicit(
        range(0, maxend, 5),
        map(str, range(0, maxend, 5)))

    ## y-axis
    axes.y.ticks.show = True
    axes.y.label.style = {"baseline-shift":"40px", "font-size":"16px"}
    axes.y.ticks.labels.style = {"baseline-shift":"5px", "font-size":"12px"}
    axes.y.ticks.below = 0
    axes.y.ticks.above = 5

    ## add fill plots
    x = np.arange(0, maxend)
    f1 = axes.fill(x, distvar, color=colormap[0], opacity=0.5, title="total variable sites")
    f2 = axes.fill(x, distpis, color=colormap[1], opacity=0.5, title="parsimony informative sites")

    ## add a horizontal dashed line at the median Nsnps per site
    axes.hlines(np.median(distvar), opacity=0.9, style={"stroke-dasharray":"4, 4"})
    axes.hlines(np.median(distpis), opacity=0.9, style={"stroke-dasharray":"4, 4"})

    return canvas, axes

# ## Functions for polling stats from vcf call data

# In[4]:

import numpy_indexed as npi

## Get the number of samples with data at each snp
def snp_coverage(call_data):
    snp_counts = collections.Counter([x.sum() for x in call_data["GT"] != "./."])
    ## Fill zero values
    return [snp_counts[x] for x in xrange(1, np.array(snp_counts.keys()).max()+1)]

## Get the number of samples with data at each locus
def loci_coverage(var_data, call_data, assembler):
    if "stacks" in assembler:
        loci = zip(*npi.group_by(map(lambda x: x.split("_")[0], var_data["ID"]))(call_data["GT"] != "./."))
    else:
        loci = zip(*npi.group_by(var_data["CHROM"])(call_data["GT"] != "./."))

    counts_per_snp = []
    for z in xrange(0, len(loci)):
        counts_per_snp.append([x.sum() for x in loci[z][1]])
    counts = collections.Counter([np.max(x) for x in counts_per_snp])
    ## Fill all zero values
    return [counts[x] for x in xrange(1, np.array(counts.keys()).max()+1)]

## Get total number of snps per sample
def sample_nsnps(call_data):
    return [x.sum() for x in call_data["GT"].T != "./."]

## Get total number of loci per sample
def sample_nloci(var_data, call_data, assembler):
    if "stacks" in assembler:
        locus_groups = npi.group_by(map(lambda x: x.split("_")[0], var_data["ID"]))(call_data["GT"] != "./.")
    else:
        locus_groups = npi.group_by(var_data["CHROM"])(call_data["GT"] != "./.")

    by_locus = [x.T for x in locus_groups[1]]
    by_sample = np.array([x.any(axis=1) for x in by_locus])
    return [x.sum() for x in by_sample.T]
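# Quick sanity check of snp_coverage() on made-up call data; a plain dict stands in for the
# vcfnp recarray since the "GT" field is all these helpers touch (toy values, not real results):

# In[ ]:

toy_calls = {"GT": np.array([["0/0", "./.", "1/1"],     ## 2 of 3 samples have data
                             ["0/1", "0/0", "./."],     ## 2 of 3 samples have data
                             ["0/0", "0/1", "1/1"]])}   ## all 3 samples have data
## -> no snps covered in exactly 1 sample, two snps in 2 samples, one snp in 3
print(snp_coverage(toy_calls))   ## [0, 2, 1]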
# ## End housekeeping. Begin actual analysis of results.

# # Results from simulated data

# ## First let's look at results just from the vcf files (so this is only looking at variable loci).

# The first thing we'll do is create a dataframe for storing a bunch
# of coverage information from the runs for each method.

# In[29]:

## Make a new pandas dataframe for holding the coverage results
sim_vcf_dict = {}
sim_vcf_dict["ipyrad-reference"] = os.path.join(IPYRAD_SIM_DIR, "refmap-sim_outfiles/refmap-sim.vcf")
sim_vcf_dict["ipyrad-denovo_plus_reference"] = os.path.join(IPYRAD_SIM_DIR, "denovo_plus_reference-sim_outfiles/denovo_plus_reference-sim.vcf")
sim_vcf_dict["ipyrad-denovo_minus_reference"] = os.path.join(IPYRAD_SIM_DIR, "denovo_minus_reference-sim_outfiles/denovo_minus_reference-sim.vcf")
sim_vcf_dict["stacks"] = os.path.join(STACKS_SIM_DIR, "batch_1.vcf")
sim_vcf_dict["ddocent"] = os.path.join(DDOCENT_SIM_DIR, "TotalRawSNPs.snps.vcf.recode.vcf")

## Make sure we have all the vcf files
for k, f in sim_vcf_dict.items():
    if os.path.exists(f):
        print("found - {}".format(f))
    else:
        print("not found - {}".format(f))

# ### Pull depth and coverage stats out of the vcf files

# Get number of loci per sample, and locus coverage for each vcf file from each assembler and each
# simulated dataset. This is reading in from the vcf files from each assembly method. This is nice
# because vcf is relatively standard and all the tools can give us a version of vcf. It's not perfect
# though because it doesn't include information about monomorphic sites, so it doesn't tell us the
# true number of loci recovered. We can get an idea of coverage and depth at snps, but to get coverage
# and depth stats across all loci we need to dig into the guts of the output of each method (which
# we'll do later).

# In[10]:

import collections

sim_loc_cov = collections.OrderedDict()
sim_snp_cov = collections.OrderedDict()
sim_sample_nsnps = collections.OrderedDict()
sim_sample_nlocs = collections.OrderedDict()

## Try just doing them all the same
for prog, filename in sim_vcf_dict.items():
    try:
        print("Doing - {}".format(prog))
        print(" {}".format(filename))
        v = vcfnp.variants(filename, verbose=False, dtypes={"CHROM":"a24"}).view(np.recarray)
        c = vcfnp.calldata_2d(filename, verbose=False).view(np.recarray)
        sim_snp_cov[prog] = snp_coverage(c)
        sim_sample_nsnps[prog] = sample_nsnps(c)
        sim_loc_cov[prog] = loci_coverage(v, c, prog)
        sim_sample_nlocs[prog] = sample_nloci(v, c, prog)
    except Exception as inst:
        print(inst)
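# As a quick cross-check on the vcf-derived stats gathered above, the number of distinct CHROM
# values in each vcf is a rough proxy for the number of variable loci (meaningful for ipyrad,
# where CHROM encodes the locus, but not for the ddocent reference assembly, where CHROM is just
# the reference contig):

# In[ ]:

for prog, filename in sim_vcf_dict.items():
    if not os.path.exists(filename):
        continue
    v = vcfnp.variants(filename, verbose=False, dtypes={"CHROM":"a24"}).view(np.recarray)
    print("{} - {} snps across {} distinct CHROM values".format(prog, len(v), len(np.unique(v["CHROM"]))))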
# ### Print out the results

# This isn't very helpful, but you get an idea of what's going on. The ddocent results for locus
# coverage and number of loci per sample are ugly because for reference sequence mapping the ddocent
# output vcf doesn't record this information. It only retains CHROM and POS, CHROM being "MT" here
# and POS being the base position of each snp.

# In[11]:

for statname, stat in {"sim_loc_cov":sim_loc_cov, "sim_snp_cov":sim_snp_cov,
                       "sim_sample_nsnps":sim_sample_nsnps, "sim_sample_nlocs":sim_sample_nlocs}.items():
    for prog in sim_vcf_dict.keys():
        try:
            print(prog + " " + statname + "\t"),
            print(stat[prog]),
            print(np.mean(stat[prog]))
        except:
            print("No {} stats for {}".format(statname, prog))
    print("------------------------------------------------------")

# ## Pairwise difference and PCA plots for each simulation treatment

# In[275]:

## Load the calldata into a dict so we don't have to keep loading and reloading it
calldata = {}
for prog in sim_vcf_dict.keys():
    print("{}".format(prog)),
    print("{}".format(sim_vcf_dict[prog]))
    c = vcfnp.calldata_2d(sim_vcf_dict[prog], verbose=False).view(np.recarray)
    calldata[prog] = c

# ### Set sample names and populations for nice plotting

# Some housekeeping with sample names to make the PCA plots prettier

# In[276]:

pop1 = ["1A_0", "1B_0", "1C_0", "1D_0"]
pop2 = ["2E_0", "2F_0", "2G_0", "2H_0"]
pop3 = ["3I_0", "3J_0", "3K_0", "3L_0"]
sim_sample_names = pop1 + pop2 + pop3
pops = {"pop1":pop1, "pop2":pop2, "pop3":pop3}
pop_colors = {"pop1":"r", "pop2":"b", "pop3":"g"}
flt = np.in1d(np.array(sim_sample_names), pop1)
print(flt)

# In[278]:

f, axarr = plt.subplots(2, 2, figsize=(10, 10), dpi=1000)
axarr = [a for b in axarr for a in b]
for prog, ax in zip(assemblers, axarr):
    coords1, model1 = getPCA(calldata[prog])

    x = coords1[:, 0]
    y = coords1[:, 1]
    ax.scatter(x, y, marker='o')
    ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
    ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))

    for pop in pops.keys():
        flt = np.in1d(np.array(sim_sample_names), pops[pop])
        ax.plot(x[flt], y[flt], marker='o', linestyle=' ',
                color=pop_colors[pop], label=pop, markersize=10, mec='k', mew=.5)
    ax.set_title(prog, style="italic")
    ax.axison = True
f.tight_layout()

# ### Plot pairwise distances for each assembler and each simulated datatype

# In[279]:

f, axarr = plt.subplots(2, 2, figsize=(10, 10), dpi=1000)
axarr = [a for b in axarr for a in b]
for prog, ax in zip(assemblers, axarr):
    ## Calculate pairwise distances
    dist = getDistances(calldata[prog])

    ## Doing it this way works, but allel uses imshow internally which rasterizes the image
    #allel.plot.pairwise_distance(dist, labels=None, ax=ax, colorbar=False)

    ## Create the pcolormesh by hand
    dat = ensure_square(dist)
    ## for some reason np.flipud(dat) is chopping off one row of data
    p = ax.pcolormesh(np.arange(0, len(dat[0])), np.arange(0, len(dat[0])), dat,
                      cmap="jet", vmin=np.min(dist), vmax=np.max(dist))
    ## Clip all heatmaps to actual sample size
    p.axes.axis("tight")
    ax.set_title(prog, style="italic")
    ax.axison = False

# # Go through and pull in more fine grained results

# What we want to know is total number of loci recovered (not just variable loci). First we'll create
# some dictionaries just like above, but we'll call them *_full_* to indicate that they include
# monomorphic sites. Because snps don't occur in monomorphic sites kind of by definition, we're only
# really interested in the depth across loci and the number of loci recovered per sample.
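# To be concrete about the bookkeeping: each entry of the *_full_* loc_cov dicts is a list where
# position i-1 holds the number of loci recovered in exactly i samples, and each sample_nlocs entry
# is a list of per-sample locus counts. Toy values for illustration only:

# In[ ]:

toy_loc_cov = [0, 3, 25, 972]              ## e.g. 972 loci present in all 4 (toy) samples
toy_sample_nlocs = [1000, 998, 1000, 997]  ## loci recovered per sample
print("loci shared by all samples - {}".format(toy_loc_cov[-1]))
print("mean loci per sample - {}".format(np.mean(toy_sample_nlocs)))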
# In[9]:

## Blank the ordered dicts for gathering locus coverage and sample nlocs
sim_full_loc_cov = collections.OrderedDict()
sim_full_sample_nlocs = collections.OrderedDict()

# ## ipyrad simulated results

# In[13]:

assembly_methods = {"ipyrad-reference":"refmap-sim", "ipyrad-denovo_reference":"denovo_ref-sim"}
for name, method in assembly_methods.items():
    print(method)
    simdir = os.path.join(IPYRAD_SIM_DIR, method + "_outfiles/")
    statsfile = simdir + "{}_stats.txt".format(method)
    infile = open(statsfile).readlines()
    sample_coverage = [int(x.strip().split()[1]) for x in infile[20:32]]
    print(sample_coverage)
    print("mean sample coverage - {}\t".format(np.mean(sample_coverage))),
    print("min/max - {}/{}\t".format(np.min(sample_coverage), np.max(sample_coverage)))
    sim_full_sample_nlocs[name] = sample_coverage
    nmissing = [int(x.strip().split()[1]) for x in infile[38:50]]
    sim_full_loc_cov[name] = nmissing

## Just look at the ones we care about for ipyrad
print(sim_full_loc_cov.items())
print(sim_full_sample_nlocs.items())

# ## stacks simulated results

# In[14]:

method = "stacks"
simdir = STACKS_SIM_DIR
try:
    lines = open("{}/batch_1.haplotypes.tsv".format(simdir)).readlines()
    cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]]
    sim_full_loc_cov[method] = [cnts.count(i) for i in range(1, 13)]
except Exception as inst:
    print("loc_cov - {} - {}".format(inst, simdir))

try:
    sim_full_sample_nlocs[method] = []
    samp_haps = glob.glob("{}/*matches*".format(simdir))
    for f in samp_haps:
        lines = gzip.open(f).readlines()
        sim_full_sample_nlocs[method].append(len(lines) - 1)
except Exception as inst:
    print("sample_nlocs - {} - {}".format(inst, simdir))

print(sim_full_loc_cov.items())
print(sim_full_sample_nlocs.items())

# ## dDocent simulated results

# It is not straightforward to get locus coverage and nloci per sample information from ddocent
# results. See the manuscript-analysis notebook for more explanation about why.

# In[10]:

DDOCENT_DIR = "/home/iovercast/manuscript-analysis/dDocent/"

def ddocent_stats(RUNDIR):
    ## This is hackish. You have to dip into the bam file to get locus counts that
    ## include counts for monomorphics. Note! that field 3 of the bam file is not
    ## the sequence data, but rather the dDocent mock-contig ('dDocent_Contig_3')
    ## the read maps to. So really this is kind of by proxy counting the number of loci.
    sample_coverage = []
    for samp in glob.glob(RUNDIR + "/*-RG.bam"):
        cmd = "{}samtools view {} | cut -f 4 | uniq | wc -l".format(DDOCENT_DIR, samp)
        res = subprocess.check_output(cmd, shell=True)
        sample_coverage.append(int(res.strip()))
    print("This must be taken with a grain of salt because these aren't the actual counts you get out of the final data files.")
    print("mean sample coverage - {}".format(np.mean(sample_coverage)))
    print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))

    cmd = "{}samtools view {}cat-RRG.bam | cut -f 3,4 | uniq".format(DDOCENT_DIR, RUNDIR)
    locus_positions = subprocess.check_output(cmd, shell=True).split("\n")

    locus_counter = collections.Counter()     # Num loci per sample
    coverage_counter = collections.Counter()  # Coverage per locus
    for loc in locus_positions:
        if not loc:
            continue
        chrom = loc.split()[0]
        pos = int(loc.split()[1])
        cmd = "{}samtools view {}cat-RRG.bam {}:{}-{} | cut -f 1 | cut -f 3 -d \"_\" | sort | uniq".format(DDOCENT_DIR, RUNDIR, chrom, pos, pos+1)
        res = subprocess.check_output(cmd, shell=True).split()
        locus_counter.update(res)
        coverage_counter.update([len(res)])

    ## Fill in zero values that aren't in the coverage counter
    dat = []
    for i in xrange(1, 13):
        try:
            dat.append(coverage_counter[i])
        except:
            dat.append(0)
    return dat, locus_counter.values()

loc_cov, sample_nlocs = ddocent_stats(DDOCENT_SIM_DIR)
sim_full_loc_cov["ddocent"] = loc_cov
sim_full_sample_nlocs["ddocent"] = sample_nlocs
print(sim_full_loc_cov.items())
print(sim_full_sample_nlocs.items())

# ## Plotting simulation results

# In[313]:

simlevel = ["simulated"]
sim_sample_nlocs_df = pd.DataFrame(index=assemblers, columns=simlevel)
for sim in simlevel:
    for ass in assemblers:
        simstring = ass
        sim_sample_nlocs_df[sim][ass] = np.mean(sim_full_sample_nlocs[simstring])

print("Mean number of loci recovered per sample.")
## Normalize all bins
dat = sim_sample_nlocs_df[sim_sample_nlocs_df.columns].astype(float)
for sim in simlevel:
    scale = 1000  # if "ipyrad-reference" == sim
    dat[sim] = dat[sim]/scale
    dat[sim]["ipyrad-reference"] *= 2
sns.heatmap(dat, square=True, center=1, linewidths=2, annot=True)
print(sim_sample_nlocs_df)

# ### Much better! Do the spline plot version of the above plot

# It just looks nicer, makes more sense.
#
# __NB__: This is broken for refmap.

# In[60]:

dfsns2 = pd.DataFrame(columns=["assembler", "simtype", "bin", "simdata"])
for sim in simlevel:
    for ass in assemblers:
        simstring = ass
        ## Normalize values so different sim sizes print the same
        max = 1000.
        newdat = sim_full_loc_cov[simstring]
        newdat = [sum(newdat)-sum(newdat[:i-1]) for i in range(1, 13)]
        for i, val in enumerate(newdat):
            dfsns2.loc[simstring + "-" + str(i)] = [ass, sim, i+1, val]
g = sns.FacetGrid(dfsns2, col="simtype", hue="assembler", size=1)
g.map(plt.scatter, "bin", "simdata")
g.map(plt.plot, "bin", "simdata").add_legend()
#axs = g.axes
#for ax in axs:
#    ax.set_xlim(0, 12.5)

# # Empirical Results (Phocoena)

# This section is incomplete. See below for empirical and simulated results that are combined.

# ## Process the vcf output from all the runs

# Here we'll pull together all the output vcf files. This takes a few minutes to run.
# In[123]:

emp_vcf_dict = {}
emp_vcf_dict["ipyrad-reference"] = os.path.join(IPYRAD_REFMAP_DIR, "refmap-empirical_outfiles/refmap-empirical.vcf")
emp_vcf_dict["ipyrad-denovo_reference"] = os.path.join(IPYRAD_REFMAP_DIR, "denovo_ref-empirical_outfiles/denovo_ref-empirical.vcf")
emp_vcf_dict["stacks"] = os.path.join(STACKS_REFMAP_DIR, "batch_1.vcf")
emp_vcf_dict["ddocent"] = os.path.join(DDOCENT_REFMAP_DIR, "TotalRawSNPs.snps.vcf.recode.vcf")

# ## Read in vcf for each analysis and pull in coverage/depth stats

# In[124]:

emp_loc_cov = collections.OrderedDict()
emp_snp_cov = collections.OrderedDict()
emp_sample_nsnps = collections.OrderedDict()
emp_sample_nlocs = collections.OrderedDict()

## Try just doing them all the same
for prog, filename in emp_vcf_dict.items():
    try:
        print("Doing - {}".format(prog))
        print("\t{}".format(filename))
        v = vcfnp.variants(filename, dtypes={"CHROM":"a24"}).view(np.recarray)
        c = vcfnp.calldata_2d(filename).view(np.recarray)
        emp_loc_cov[prog] = loci_coverage(v, c, prog)
        emp_snp_cov[prog] = snp_coverage(c)
        emp_sample_nsnps[prog] = sample_nsnps(c)
        emp_sample_nlocs[prog] = sample_nloci(v, c, prog)
        plotPCA(c, prog)
        plotPairwiseDistance(c, prog)
    except Exception as inst:
        print(inst)

# In[608]:

for i in emp_sample_nlocs:
    print(i),
    print(emp_sample_nlocs[i])

# ## ipyrad Empirical results

# First we load in the variant info and call data for all the snps

# In[70]:

IPYRAD_EMPIRICAL_OUTPUT = os.path.join(IPYRAD_DIR, "REALDATA/")
IPYRAD_STATS = os.path.join(IPYRAD_EMPIRICAL_OUTPUT, "REALDATA_outfiles/REALDATA_stats.txt")
infile = open(IPYRAD_STATS).readlines()
sample_coverage = [int(x.strip().split()[1]) for x in infile[20:33]]
print(sample_coverage)
print("mean sample coverage - {}".format(np.mean(sample_coverage)))
print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))

# Read the biallelic vcf

# In[71]:

filename = os.path.join(IPYRAD_EMPIRICAL_OUTPUT, "REALDATA_outfiles/REALDATA.biallelic.vcf")
# filename = vcf_dict["ipyrad"]
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)

# ### Distribution of snps along loci

# Getting variable sites and parsimony informative sites from the vcf is kind of annoying
# because all the programs export __slightly__ different formats, so you need to
# parse them in slightly different ways. There's a better way to do this for ipyrad
# but i figure i'll do it the same way for all of them so it's more clear what's happening.
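# The parsimony-informative filter below boils down to: a snp is informative when the alternate
# allele shows up in at least two samples. A toy run of the masking logic (made-up genotypes,
# not from any assembly):

# In[ ]:

toy_geno = np.array([[[0, 0], [0, 1], [0, 1]],    ## alt in 2 samples -> informative
                     [[0, 0], [0, 0], [0, 1]]])   ## alt in 1 sample  -> not informative
toy_pos = np.array([5, 9])
toy_alt = map(lambda x: map(lambda y: 1 in y, x), toy_geno)
toy_counts = map(lambda x: np.count_nonzero(x), toy_alt)
toy_mask = map(lambda x: x < 2, toy_counts)               ## True = masked out
print(np.ma.array(toy_pos, mask=toy_mask).compressed())   ## [5]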
# In[72]:

## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a mask that is True (i.e. masked out) for snps that are NOT informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["POS"]), mask=only_pis).compressed()

## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x) for x in pis])
#distpis = [x for x in sorted(counts.items())]

## Getting the distvar is easier
distvar = Counter([int(x) for x in v.POS])
#distvar = [x for x in sorted(counts.items())]

canvas, axes = SNP_position_plot("ipyrad", distvar, distpis)

## save fig
#toyplot.html.render(canvas, 'snp_positions.html')

canvas

# ## stacks empirical results

# In[12]:

STACKS_OUTPUT = os.path.join(STACKS_DIR, "REALDATA/")
STACKS_GAP_OUT = os.path.join(STACKS_OUTPUT, "gapped/")
STACKS_UNGAP_OUT = os.path.join(STACKS_OUTPUT, "ungapped/")
STACKS_DEFAULT_OUT = os.path.join(STACKS_OUTPUT, "default/")

#lines = open("SIMsmall/stackf_high/batch_1.haplotypes.tsv").readlines()
#cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]]
#shigh = [cnts.count(i) for i in range(1,13)]

# In[13]:

filename = os.path.join(STACKS_UNGAP_OUT, "batch_1.vcf")
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)

# ### Distribution of snps along loci

# In[14]:

## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a mask that is True (i.e. masked out) for snps that are NOT informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["ID"]), mask=only_pis).compressed()

## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x.split("_")[1]) for x in pis])
#distpis = [x for x in sorted(counts.items())]

## Getting the distvar is easier
distvar = Counter([int(x.split("_")[1]) for x in v.ID])
#distvar = [x for x in sorted(counts.items())]

canvas, axes = SNP_position_plot("stacks", distvar, distpis)

## save fig
#toyplot.html.render(canvas, 'snp_positions.html')

canvas

# # SKIP to HERE

# # Do both simulated and empirical at the same time

# Could make the plots prettier

# In[349]:

vcf_dict = {}
vcf_dict["ipyrad-reference-sim"] = os.path.join(IPYRAD_SIM_DIR, "refmap-sim_outfiles/refmap-sim.vcf")
vcf_dict["ipyrad-denovo_plus_reference-sim"] = os.path.join(IPYRAD_SIM_DIR, "denovo_plus_reference-sim_outfiles/denovo_plus_reference-sim.vcf")
vcf_dict["ipyrad-denovo_minus_reference-sim"] = os.path.join(IPYRAD_SIM_DIR, "denovo_minus_reference-sim_outfiles/denovo_minus_reference-sim.vcf")
vcf_dict["stacks-sim"] = os.path.join(STACKS_SIM_DIR, "batch_1.vcf")
vcf_dict["ddocent-tot-sim"] = os.path.join(DDOCENT_SIM_DIR, "TotalRawSNPs.snps.vcf.recode.vcf")
vcf_dict["ddocent-fin-sim"] = os.path.join(DDOCENT_SIM_DIR, "Final.recode.snps.vcf.recode.vcf")
vcf_dict["ipyrad-reference-empirical"] = os.path.join(IPYRAD_REFMAP_DIR, "reference-assembly/refmap-empirical_outfiles/refmap-empirical.vcf")
vcf_dict["ipyrad-denovo_plus_reference-empirical"] = os.path.join(IPYRAD_REFMAP_DIR, "reference-assembly/denovo_plus_reference_outfiles/denovo_ref-empirical.vcf")
vcf_dict["ipyrad-denovo_plus_reference-095-empirical"] = os.path.join(IPYRAD_REFMAP_DIR, "reference-assembly/denovo_plus_reference-095_outfiles/denovo_plus_reference-095.vcf")
vcf_dict["ipyrad-denovo_minus_reference-empirical"] = os.path.join(IPYRAD_REFMAP_DIR, "reference-assembly/denovo_minus_reference_outfiles/denovo_ref-empirical.vcf")
vcf_dict["stacks-empirical"] = os.path.join(STACKS_REFMAP_DIR, "batch_1.vcf")
vcf_dict["ddocent-fin-empirical"] = os.path.join(DDOCENT_REFMAP_DIR, "Final.recode.snps.vcf.recode.vcf")

## skip the full ddocent vcf. It's huge and we know we don't want to use it.
#vcf_dict["ddocent-tot-empirical"] = os.path.join(DDOCENT_REFMAP_DIR, "TotalRawSNPs.snps.vcf.recode.vcf")

## Make sure we have all the vcf files
for k, f in vcf_dict.items():
    if os.path.exists(f):
        print("found - {}".format(f))
    else:
        print("not found - {}".format(f))

# ### Some magic to get samples and populations properly grouped and colored for pretty PCA plots

# There has got to be a better way to do this for the empirical at least. It seems super hax, but it works.

# In[56]:

## sim
pop1 = ["1A_0", "1B_0", "1C_0", "1D_0"]
pop2 = ["2E_0", "2F_0", "2G_0", "2H_0"]
pop3 = ["3I_0", "3J_0", "3K_0", "3L_0"]
sim_sample_names = pop1 + pop2 + pop3
sim_pops = {"pop1":pop1, "pop2":pop2, "pop3":pop3}
sim_pop_colors = {"pop1":"r", "pop2":"b", "pop3":"g"}

## empirical
emp_pops = {}
emp_sample_names = []
popfile = os.path.join(STACKS_REFMAP_DIR, "popmap.txt")
with open(popfile) as infile:
    lines = [l.strip().split() for l in infile.readlines()]
emp_sample_names = [x[0] for x in lines]
pops = set([x[1] for x in lines])
for pop in pops:
    emp_pops[pop] = []
for line in lines:
    p = line[1]
    s = line[0]
    emp_pops[p].append(s)
emp_pop_colors = {k:v for (k, v) in zip(emp_pops, list(matplotlib.colors.cnames))}

## Write out the samples to pop files for vcftools
for outdir, pop_dict in {REFMAP_SIM_DIR:sim_pops, REFMAP_EMPIRICAL_DIR:emp_pops}.items():
    for pop, samps in pop_dict.items():
        with open(outdir + pop + ".txt", "w") as outfile:
            outfile.write("\n".join(samps))

# In[351]:

## The order of the samples in the vcf file is wonky. Reorder them so they're sorted by population. This
## is roughly the order of populations from the manuscript.
## If you run this multiple times, you should re-run the previous cell that resets
## the vcf file to the original path, otherwise you get a bunch of *.reorder.reorder.vcf files.
order = ["WBS", "IS", "NOS", "SK1", "KB1", "BES2", "IBS"] vcf_header = """##fileformat=VCFv4.1\n##fileDate=20161230\n##source="Stacks v1.42"\n##INFO=\n##INFO=\n##FORMAT=\n##FORMAT=\n##FORMAT=\n##FORMAT=\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT""" with open("/tmp/dummy.vcf", 'w') as outfile: outfile.write(vcf_header) for p in order: outfile.write("\t" + "\t".join(emp_pops[p])) for k, f in vcf_dict.items(): if "stacks" in k: ## The version of vcftools we have here doesn't like the newer version of vcf stacks ## writes, so we have to 'fix' the stacks vcf version print("Fixing stacks vcf file format - {}".format(k)) shutil.copy2(f, f+".bak") with open(f) as infile: dat = infile.readlines()[1:] with open(f, 'w') as outfile: outfile.write("##fileformat=VCFv4.1\n") outfile.write("".join(dat)) if "empirical" in k: print("reordering") vcftools_path = os.path.join(DDOCENT_DIR, "vcftools_0.1.11/perl/") os.chdir(vcftools_path) tmpvcf = "tmp.vcf" newvcf = f.rsplit(".", 1)[0] + ".reordered.vcf" print(newvcf) cmd = "perl vcf-shuffle-cols -t {} {} > {}".format("/tmp/dummy.vcf", f, tmpvcf) print(cmd) os.system(cmd) get_ipython().system('mv $tmpvcf $newvcf') ## Don't destroy the old one vcf_dict[k] = newvcf # In[352]: import collections ## Load the calldata into a dict so we don't have to keep loading and reloading it ## This can take several minutes for the large empirical datasets (like 20+ minutes) all_calldata = {} all_vardata = {} ## Dicts for tracking all the stats loc_cov = collections.OrderedDict() snp_cov = collections.OrderedDict() samp_nsnps = collections.OrderedDict() samp_nlocs = collections.OrderedDict() for prog, filename in vcf_dict.items(): try: print("Doing - {}\n {}".format(prog, filename)) v = vcfnp.variants(filename, verbose=False, dtypes={"CHROM":"a24"}).view(np.recarray) c = vcfnp.calldata_2d(filename, verbose=False).view(np.recarray) all_calldata[prog] = c all_vardata[prog] = v snp_cov[prog] = snp_coverage(c) samp_nsnps[prog] = sample_nsnps(c) loc_cov[prog] = loci_coverage(v, c, prog) samp_nlocs[prog] = sample_nloci(v, c, prog) except Exception as inst: print(inst) # In[353]: for statname, stat in {"loc_cov":loc_cov, "snp_cov":snp_cov,\ "sample_nsnps":samp_nsnps, "sample_nlocs":samp_nlocs}.items(): for prog in vcf_dict.keys(): try: print(prog + " " + statname + "\t"), print(stat[prog]), print(np.mean(stat[prog])) except: print("No {} stats for {}".format(statname, prog)) print("------------------------------------------------------") # ## Filter linked snps? Doesn't make a difference. # In[354]: f, axarr = plt.subplots(2, 5, figsize=(20,8), dpi=1000) ## Make a list out of the axes axarr = [a for b in axarr for a in b] ## Set them in order so the plot looks nice. 
progs = ["ipyrad-reference-sim", "ipyrad-denovo_plus_reference-sim", "ipyrad-denovo_minus_reference-sim", "stacks-sim", "ddocent-fin-sim",\ "ipyrad-reference-empirical", "ipyrad-denovo_plus_reference-095-empirical", "ipyrad-denovo_minus_reference-empirical", "stacks-empirical", "ddocent-fin-empirical"] for prog, ax in zip(progs, axarr): print(prog, ax) if "empirical" in prog: pop_colors = emp_pop_colors pops = emp_pops sample_names = emp_sample_names else: pop_colors = sim_pop_colors pops = sim_pops sample_names = sim_sample_names ## Don't die if some of the runs aren't complete try: coords1, model1 = getPCA(all_calldata[prog]) except: continue x = coords1[:, 0] y = coords1[:, 1] ax.scatter(x, y, marker='o') ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100)) ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100)) for pop in pops.keys(): flt = np.in1d(np.array(sample_names), pops[pop]) ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=pop_colors[pop], label=pop, markersize=10, mec='k', mew=.5) ax.set_title(prog, style="italic") ax.axison = True axarr[0].legend(frameon=True) axarr[5].legend(loc='lower left', frameon=True) f.tight_layout() # In[355]: f, axarr = plt.subplots(2, 5, figsize=(20,8), dpi=1000) axarr = [a for b in axarr for a in b] ## Set them in order so the plot looks nice. progs = ["ipyrad-reference-sim", "ipyrad-denovo_plus_reference-sim", "ipyrad-denovo_minus_reference-sim", "stacks-sim", "ddocent-fin-sim",\ "ipyrad-reference-empirical", "ipyrad-denovo_plus_reference-095-empirical", "ipyrad-denovo_minus_reference-empirical", "stacks-empirical", "ddocent-fin-empirical"] for prog, ax in zip(progs, axarr): print(prog, ax) try: ## Calculate pairwise distances dist = getDistances(all_calldata[prog]) except: continue ## Create the pcolormesh by hand dat = ensure_square(dist) ## for some reason np.flipud(dat) is chopping off one row of data p = ax.pcolormesh(np.arange(0,len(dat[0])), np.arange(0,len(dat[0])), dat,\ cmap="jet", vmin=np.min(dist), vmax=np.max(dist)) ## Clip all heatmaps to actual sample size p.axes.axis("tight") ax.set_title(prog, style="italic") ax.axison = False ## Adjust margins to make room for the colorbar plt.subplots_adjust(left=0.05, bottom=0.1, right=0.8, top=0.9, wspace=0.2, hspace=0.3) ## Add the colorbar cax = f.add_axes([0.87, 0.4, 0.03, 0.5]) cb1 = matplotlib.colorbar.ColorbarBase(cax, cmap="jet", orientation="vertical", ticks=([0,1])) cb1.ax.set_yticklabels(['Similar', "Dissimilar"], weight="bold", rotation="vertical") # ## Pull in finer grained stats for each assembler # For ipyrad and stacks this is very quick, but the whole step is slow because ddocent empirical # stats take _forever_ to obtain. # In[357]: ## Blank the ordered dicts for gathering locus coverage and sample nlocs all_full_loc_cov = collections.OrderedDict() all_full_sample_nlocs = collections.OrderedDict() ## Mapping between 'pretty names' and directory names. This is only really useful for ipyrad ## where i named the directories a little silly. For stacks and ddocent this doesn't really do anything. 
assembly_methods = {"ipyrad-reference-sim":"refmap-sim", "ipyrad-denovo_reference-sim":"denovo_ref-sim",\ "stacks-sim":"stacks-sim", "ipyrad-reference-empirical":"refmap-empirical",\ "stacks-empirical":"stacks-empirical", "ddocent-simulated":"ddocent-simulated",\ "ddocent-empirical":"ddocent-empirical",\ "ipyrad_denovo_minus_reference-sim":"denovo_minus_reference-sim",\ "ipyrad_denovo_plus_reference-sim":"denovo_plus_reference-sim",\ "ipyrad_denovo_minus_reference-empirical":"denovo_minus_reference",\ "ipyrad_denovo_plus_reference-empirical":"denovo_plus_reference-095" } for name, method in assembly_methods.items(): print("Doing - {}".format(name)) if "ipyrad" in name: outdir = IPYRAD_SIM_DIR firstsamp = 20 lastsamp = 32 if "empirical" in name: outdir = IPYRAD_REFMAP_DIR + "reference-assembly/" firstsamp = 20 #fixme lastsamp = 64 # fixme try: nsamps = lastsamp - firstsamp simdir = os.path.join(outdir, method + "_outfiles/") statsfile = simdir + "{}_stats.txt".format(method) infile = open(statsfile).readlines() sample_coverage = [int(x.strip().split()[1]) for x in infile[firstsamp:lastsamp]] print("mean sample coverage - {}\t".format(np.mean(sample_coverage))), print("min/max - {}/{}\t".format(np.min(sample_coverage), np.max(sample_coverage))) all_full_sample_nlocs[name] = sample_coverage nmissing = [int(x.strip().split()[1]) for x in infile[lastsamp+6:lastsamp+6+nsamps]] all_full_loc_cov[name] = nmissing except Exception as inst: print(inst) elif "stacks" in name: outdir = STACKS_SIM_DIR nsamps = 12 if "empirical" in name: outdir = STACKS_REFMAP_DIR nsamps = 44 try: ## Effectively the same as thisjjjj ## cut -f 2 batch_1.haplotypes.tsv | sort -n | uniq -c | less lines = open("{}/batch_1.haplotypes.tsv".format(outdir)).readlines() cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]] all_full_loc_cov[method] = [cnts.count(i) for i in range(1,nsamps+1)] except Exception as inst: print("loc_cov - {} - {}".format(inst, simdir)) try: all_full_sample_nlocs[method] = [] ## This is actually number of loci ## cut -f 3 *matches | sort -n | uniq | wc -l ## Right now this is actually counting number of snps samp_haps = glob.glob("{}/*matches*".format(outdir)) for f in samp_haps: lines = gzip.open(f).readlines() all_full_sample_nlocs[method].append(len(lines) - 1) except Exception as inst: print("sample_nlocs - {} - {}".format(inst, outdir)) # elif "ddocent" in name: # outdir = DDOCENT_SIM_DIR # if "empirical" in name: # outdir = DDOCENT_REFMAP_DIR # loc_cov, sample_nlocs = ddocent_stats(outdir) # all_full_loc_cov[method] = loc_cov # all_full_sample_nlocs[method] = sample_nlocs else: print("wtf?") try: print(all_full_loc_cov[name]) print(all_full_sample_nlocs[name]) except: print("no data for - {}".format(name)) pass print("Done - {}\n\n".format(name)) print(all_full_loc_cov.items()) print(all_full_sample_nlocs.items()) # ## Get pairwise fst for each population for simulated and empirical # Using vcftools just to get the mean fst between each population pair. # In[359]: import itertools ## dict for storing fst values. This is a dict of dictionaries. The top level is for ## each assembly type and within each assembly type is a dict of pop pairs and the fst between them. 
# In[359]:

import itertools

## dict for storing fst values. This is a dict of dictionaries. The top level is keyed by
## assembly type, and within each assembly type is a dict of pop pairs and the fst between them.
fsts = {}

## Get pairwise fst for each pop for sim and empirical
d = {REFMAP_SIM_DIR:sim_pops, REFMAP_EMPIRICAL_DIR:emp_pops}
for k, v in vcf_dict.items():
    if not os.path.exists(v):
        continue
    if "sim" in k:
        indir = REFMAP_SIM_DIR
        pop_dict = sim_pops
    else:
        indir = REFMAP_EMPIRICAL_DIR
        pop_dict = emp_pops
    os.chdir(indir)
    try:
        print("Doing - {}".format(k))
        fsts[k] = {}
        combinations = list(itertools.combinations(pop_dict, 2))
        for pair in combinations:
            pop1 = indir + pair[0] + ".txt"
            pop2 = indir + pair[1] + ".txt"
            vcftools_cmd = DDOCENT_DIR + "vcftools"
            cmd = "{} --vcf {} --weir-fst-pop {} --weir-fst-pop {} --out {}".format(vcftools_cmd, v, pop1, pop2, k)
            ret = subprocess.check_output(cmd, shell=True)
            ret = ret.split("\n")
            ## Get the mean fst from the output
            fst = [x for x in ret if "Weir and Cockerham mean" in x][0].split(": ")[1]
            weighted_fst = [x for x in ret if "Weir and Cockerham weighted" in x][0].split(": ")[1]
            print("-".join(pair), "{}/{}".format(fst, weighted_fst)),
            fsts[k]["-".join(pair)] = "{}/{}".format(fst, weighted_fst)
        print("\n")
    except Exception as inst:
        print("Failed - {} - {}".format(k, inst))
        ## Allow it to fail if the vcf doesn't exist.
        pass

# In[360]:

from IPython.display import display
import pandas as pd

## Organize and pretty print the fsts dict
df_sim = pd.DataFrame(index=sim_pops.keys(), columns=sim_pops.keys(), dtype=str).fillna("")
df_emp = pd.DataFrame(index=emp_pops.keys(), columns=emp_pops.keys(), dtype=str).fillna("")
for assembler, all_data in fsts.items():
    print("{}".format(assembler))
    if "sim" in assembler:
        indir = REFMAP_SIM_DIR
        pop_dict = sim_pops
        df = df_sim
    else:
        indir = REFMAP_EMPIRICAL_DIR
        pop_dict = emp_pops
        df = df_emp
    ## Init the diagonal
    for p in pop_dict.keys():
        df[p][p] = ""
    for p, fst_data in all_data.items():
        #print(df)
        p1 = p.split("-")[0]
        p2 = p.split("-")[1]
        df[p1][p2] += " "+fst_data.split("/")[0][:6]
        df[p2][p1] += " "+fst_data.split("/")[1][:6]

pd.set_option('display.max_colwidth', 800)
pd.set_option('display.width', 80)
df_sim.style.set_properties(**{'max-width':'10px'})
display(df_sim)
display(df_emp)

# ## Quick and dirty raxml trees

# ipyrad and stacks both kindly write out phylip files, but ddocent doesn't, so we'll have to make one ourselves.

# In[177]:

get_ipython().run_cell_magic('bash', '-s "$WORK_DIR"', '## Install raxml\nmkdir $1/miniconda/src\ncd $1/miniconda/src\ngit clone https://github.com/stamatak/standard-RAxML.git\ncd standard-RAxML\nmake -f Makefile.PTHREADS.gcc\ncp raxmlHPC-PTHREADS $1/miniconda/bin\n')

# In[211]:

## Get this script to convert from vcf to phy
## This script wants python 3, so I had to go in and add the future import for
## print by hand. Add this on line 17:
##
##   from __future__ import print_function
##
## Also, had to update lines 32-34 like so:
##
##   iupac = {"AG": "R", "CT": "Y", "CG": "S", "AT": "W", "GT": "K", "AC": "M",
##            "CGT": "B", "AGT": "D", "ACT": "H", "ACG": "V", "ACGT": "N", "AA": "A",
##            "CC": "C", "GG": "G", "TT": "T", "GN": "N", "CN": "N", "AN": "N", "TN": "N",
##            "NT": "N", "NC": "N", "NG": "N", "NA": "N", "NN": "N"}
##
## Also, when we decompose the ddocent complex genotypes vcftools splits ./.
## data as '.', which VCF2phy.py kinda hates, so you have to add this piece
## of code at lines 75/76:
##
##   if genotype_string == ".":
##       return "N"

#!wget https://raw.githubusercontent.com/CoBiG2/RAD_Tools/master/VCF2phy.py

for k, v in vcf_dict.items():
    if "stacks" in k and "sim" in k:
        outphy = v.rsplit(".", 1)[0]
        print(outphy)
        cmd = "python VCF2phy.py -vcf {} -o {}".format(v, outphy)
        ret = subprocess.check_output(cmd, shell=True)
        print(ret)
    if "ddocent" in k:
        ## Skip doing the big boy for now
        if "empirical" in k and "Total" in v:
            continue
        ## Do not add .phy to outphy, the python script does it for us.
        outphy = v.rsplit(".", 1)[0]
        print(outphy)
        cmd = "python VCF2phy.py -vcf {} -o {}".format(v, outphy)
        ret = subprocess.check_output(cmd, shell=True)
        print(ret)

# ## Run raxml

# For simulated datasets raxml runs very fast (<20 seconds per phylip file).
#
# For empirical it's much slower, maybe 1-2 days per assembler. Be sure to use the '.snps.phy' ipyrad file
# or else raxml runs _forever_.

# In[361]:

for d in [REFMAP_SIM_DIR, REFMAP_EMPIRICAL_DIR]:
    raxoutdir = d + "raxml_outdir"
    if not os.path.exists(raxoutdir):
        os.mkdir(raxoutdir)

emp_trees = {}
sim_trees = {}
for assembler, vcffile in vcf_dict.items():
    if "sim" in assembler:
        outdir = REFMAP_SIM_DIR + "raxml_outdir"
        ncores = 2
        out_trees = sim_trees
        physplit = 1
    else:
        outdir = REFMAP_EMPIRICAL_DIR + "raxml_outdir"
        ncores = 20
        out_trees = emp_trees
        physplit = 2
    if "ipyrad" in assembler:
        inputfile = vcffile.rsplit(".", physplit)[0] + ".snps.phy"
    if "stacks" in assembler:
        inputfile = vcffile.rsplit(".", 2)[0] + ".phylip"
    if "ddocent" in assembler:
        inputfile = vcffile.rsplit(".", 1)[0] + ".phy"
    if not os.path.exists(inputfile):
        print("\nCan't find .phy or .phylip for - {}\n".format(inputfile.rsplit(".", 1)[0]))

    cmd = "{}raxmlHPC-PTHREADS -f a ".format(WORK_DIR + "/miniconda/bin/") \
          + " -T {} ".format(ncores) \
          + " -m GTRGAMMA " \
          + " -N 100 " \
          + " -x 12345 " \
          + " -p 54321 " \
          + " -n {} ".format(assembler) \
          + " -w {} ".format(outdir) \
          + " -s {}".format(inputfile)
    print(cmd)

    ## RAxML_bestTree is the ML tree without support values; RAxML_bipartitions is the
    ## same topology with bootstrap support drawn on, which is what we want to plot.
    ## out_trees[assembler] = "{}/RAxML_bestTree.{}".format(outdir, assembler)
    out_trees[assembler] = "{}/RAxML_bipartitions.{}".format(outdir, assembler)
    #continue
    if "sim" in assembler:
        print(assembler)
        #!time $cmd
    if "empirical" in assembler:
        print(assembler)
        #!time $cmd

# In[368]:

#!conda install biopython
from Bio import Phylo

f, axarr = plt.subplots(1, 5, figsize=(20, 8), dpi=1000)
## For more than 1 row uncomment this
#axarr = [a for b in axarr for a in b]

## Pop membership and colors per pop were established above during the PCA
#sim_sample_names = pop1 + pop2 + pop3
#sim_pops = {"pop1":pop1, "pop2":pop2, "pop3":pop3}
#sim_pop_colors = {"pop1":"r", "pop2":"b", "pop3":"g"}
sim_colors_per_sample = {}
for samp in sim_sample_names:
    for pop in sim_pops:
        if samp in sim_pops[pop]:
            sim_colors_per_sample[samp] = sim_pop_colors[pop]

## Set them in order so the plot looks nice.
#progs = ["ipyrad-reference-sim", "ipyrad-denovo_plus_reference-sim", "ipyrad-denovo_minus_reference-sim", "stacks-sim", "ddocent-fin-sim",\ # "ipyrad-reference-empirical", "ipyrad-denovo_reference-empirical", "ipyrad-denovo_minus_reference-empirical", "stacks-empirical", "ddocent-fin-empirical"] outgroup = [{'name': taxon_name} for taxon_name in pop3] for assembler, ax in zip(sim_trees.keys(), axarr): print(assembler, sim_trees[assembler], ax) tree = Phylo.read(sim_trees[assembler], 'newick') tree.ladderize() tree.root_with_outgroup(*outgroup) ## This could be cool but doesn't work Phylo.draw(tree, axes = ax, do_show=False, label_colors=sim_colors_per_sample) ax.set_title(assembler) plt.tight_layout() # # TESTING - Everything below here is crap # In[367]: del sim_trees['ddocent-tot-sim'] print(len(sim_trees)) print(sim_trees) # In[ ]: ## Set empirical colors emp_colors_per_sample = {} for samp in emp_sample_names: for pop in emp_pops: if samp in emp_pops[pop]: emp_colors_per_sample[samp] = emp_pop_colors[pop] # In[ ]: import ete3 # In[ ]: for prog in all_calldata.keys(): #for prog in ["ipyrad-reference-sim"]: print("Doing {}".format(prog)) try: c = all_calldata[prog] v = all_vardata[prog] except: print("Failed to load for {}".format(prog)) continue ## Get only parsimony informative sites ## Get T/F values for whether each genotype is ref or alt across all samples/loci is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"]) ## Count the number of alt alleles per snp (we only want to retain when #alt > 1) alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele) ## Create a T/F mask for snps that are informative only_pis = map(lambda x: x < 2, alt_counts) ## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus ## Also, compress() the masked array so we only actually see the pis pis = np.ma.array(np.array(v["POS"]), mask=only_pis).compressed() ## Now have to massage this into the list of counts per site in increasing order ## of position across the locus distpis = Counter([int(x) for x in pis]) #distpis = [x for x in sorted(counts.items())] ## Getting the distvar is easier distvar = Counter([int(x) for x in v.POS]) #distvar = [x for x in sorted(counts.items())] canvas, axes = SNP_position_plot(prog, distvar, distpis) ## save fig #toyplot.html.render(canvas, 'snp_positions.html') canvas # In[ ]: ## Checking out differences between 0.85 and 0.95 clustering threshold for ipyrad # In[339]: filename = "/home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/denovo_minus_reference-0.95_outfiles/denovo_ref-empirical.vcf" ## Only need to load once #myv = vcfnp.variants(filename, verbose=False, dtypes={"CHROM":"a24"}).view(np.recarray) #myc = vcfnp.calldata_2d(filename, verbose=False).view(np.recarray) f, axarr = plt.subplots(1, 2, figsize=(20,8), dpi=1000) ## Make a list out of the axes ## Set them in order so the plot looks nice. 
progs = ["ipyrad-reference-sim", "ipyrad-denovo_plus_reference-sim", "ipyrad-denovo_minus_reference-sim", "stacks-sim", "ddocent-fin-sim",\ "ipyrad-reference-empirical", "ipyrad-denovo_reference-empirical", "ipyrad-denovo_minus_reference-empirical", "stacks-empirical", "ddocent-fin-empirical"] rundict = {"0.85":all_calldata["ipyrad-denovo_minus_reference-empirical"], "0.95":myc} for prog, ax in zip(rundict.keys(), axarr): pop_colors = emp_pop_colors pops = emp_pops sample_names = emp_sample_names ## Don't die if some of the runs aren't complete try: coords1, model1 = getPCA(rundict[prog]) except: continue x = coords1[:, 0] y = coords1[:, 1] ax.scatter(x, y, marker='o') ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100)) ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100)) for pop in pops.keys(): flt = np.in1d(np.array(sample_names), pops[pop]) ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=pop_colors[pop], label=pop, markersize=10, mec='k', mew=.5) ax.set_title(prog, style="italic") ax.axison = True axarr[0].legend(frameon=True) # In[329]: def plot_ld(gn, title): m = allel.stats.rogers_huff_r(gn) ** 2 ax = allel.plot.pairwise_ld(m) ax.set_title(title) def ld_prune(gn, size=1000, step=1000, threshold=.3, n_iter=5): for i in range(n_iter): loc_unlinked = allel.stats.ld.locate_unlinked(gn, size=size, step=step, threshold=threshold) n = np.count_nonzero(loc_unlinked) n_remove = gn.shape[0] - n print('iteration', i+1, 'retaining', n, 'removing', n_remove, 'variants') gn = gn.compress(loc_unlinked, axis=0) return gn # In[293]: g = allel.GenotypeArray(myc.genotype) ac = g.count_alleles() ## Filter singletons and multi-allelic snps flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1) gf = g.compress(flt, axis=0) gn = gf.to_n_alt() # In[303]: gnu = ld_prune(gn, size=50, step=200, threshold=.1, n_iter=5) # In[ ]: coords1, model1 = allel.pca(gnu, n_components=10, scaler='patterson')