I'm assuming here that you've already run the de novo assemblies and have already installed all the prereqs for analysing the results. If not run the ipyrad-manuscript-results.ipynb.
## Imports and working/output directories directories
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
plt.rcParams["figure.figsize"] = [12,9]
from collections import Counter
import seaborn as sns
sns.set_style('white')
sns.set_style('ticks')
import toyplot
import toyplot.html ## toypot sublib for saving html plots
import pandas as pd
import numpy as np
import subprocess
import collections
import allel
import vcfnp
import shutil
import gzip
import glob
import os
from allel.util import * ## for ensure_square()
WORK_DIR="/home/iovercast/manuscript-analysis/"
## Simulation dirs
REFMAP_SIM_DIR = os.path.join(WORK_DIR, "REFMAP_SIM/")
IPYRAD_SIM_DIR = os.path.join(REFMAP_SIM_DIR, "ipyrad/reference-assembly/")
STACKS_SIM_DIR = os.path.join(REFMAP_SIM_DIR, "stacks/")
DDOCENT_SIM_DIR = os.path.join(REFMAP_SIM_DIR, "ddocent/")
## A list of the simluated assembler names for indexing dicts
assemblers = ["ipyrad-reference", "ipyrad-denovo_reference", "stacks", "ddocent"]
## Empirical dirs
REFMAP_EMPIRICAL_DIR=os.path.join(WORK_DIR, "Phocoena_empirical/")
IPYRAD_REFMAP_DIR=os.path.join(REFMAP_EMPIRICAL_DIR, "ipyrad/")
STACKS_REFMAP_DIR=os.path.join(REFMAP_EMPIRICAL_DIR, "stacks/")
DDOCENT_REFMAP_DIR=os.path.join(REFMAP_EMPIRICAL_DIR, "ddocent/")
## LD pruning code from the scikit-allele cookbook
def plot_ld(gn, title):
m = allel.stats.rogers_huff_r(gn) ** 2
ax = allel.plot.pairwise_ld(m)
ax.set_title(title)
def ld_prune(gn, size=1000, step=1000, threshold=.3, n_iter=5):
for i in range(n_iter):
loc_unlinked = allel.stats.ld.locate_unlinked(gn, size=size, step=step, threshold=threshold)
n = np.count_nonzero(loc_unlinked)
n_remove = gn.shape[0] - n
print('iteration', i+1, 'retaining', n, 'removing', n_remove, 'variants')
gn = gn.compress(loc_unlinked, axis=0)
return gn
def plotPCA(call_data, title):
c = call_data
g = allel.GenotypeArray(c.genotype)
ac = g.count_alleles()
## Filter singletons and multi-allelic snps
flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
gf = g.compress(flt, axis=0)
gn = gf.to_n_alt()
coords1, model1 = allel.stats.pca(gn, n_components=10, scaler='patterson')
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(1, 1, 1)
sns.despine(ax=ax, offset=5)
x = coords1[:, 0]
y = coords1[:, 1]
## We know this works because the species_dict and the columns in the vcf
## are in the same order.
for sp in species:
flt = (np.array(species_dict.values()) == sp)
ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=species_colors[sp], label=sp, markersize=10, mec='k', mew=.5)
ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))
ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
fig.suptitle(title+" pca", y=1.02, style="italic", fontsize=20, fontweight='bold')
fig.tight_layout()
def getPCA(call_data):
c = call_data
g = allel.GenotypeArray(c.genotype)
ac = g.count_alleles()
## Filter singletons and multi-allelic snps
flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
gf = g.compress(flt, axis=0)
gn = gf.to_n_alt()
## Test w/ ld pruning. Doesn't appreciably change the results.
#gnu = ld_prune(gn)
#gn = gnu
coords1, model1 = allel.stats.pca(gn, n_components=10, scaler='patterson')
return coords1, model1
## We don't actually use this function bcz the allele.plot.pairwise_distance()
## call returns a raster which looks nasty in print.
def plotPairwiseDistance(call_data, title):
c = call_data
#c = vcfnp.calldata_2d(filename).view(np.recarray)
g = allel.GenotypeArray(c.genotype)
gn = g.to_n_alt()
dist = allel.stats.pairwise_distance(gn, metric='euclidean')
allel.plot.pairwise_distance(dist, labels=species_dict.keys())
def getDistances(call_data):
c = call_data
#c = vcfnp.calldata_2d(filename).view(np.recarray)
g = allel.GenotypeArray(c.genotype)
gn = g.to_n_alt()
dist = allel.stats.pairwise_distance(gn, metric='euclidean')
return(dist)
## Inputs to this function are two Counters where keys
## are the base position and values are snp counts.
## The counter doesn't have to be sorted because we sort internally.
def SNP_position_plot(prog, distvar, distpis):
## The last position to consider
maxend = np.array(distvar.keys()).max()
## This does two things, first it sorts in increasing
## order. Second, it creates a count bin for any position
## without snps and sets the count to 0.
distvar = [distvar[x] for x in xrange(maxend)]
distpis = [distpis[x] for x in xrange(maxend)]
## set color theme
colormap = toyplot.color.Palette()
## make a canvas
canvas = toyplot.Canvas(width=800, height=300)
## make axes
axes = canvas.cartesian(xlabel="Position along RAD loci",
ylabel="N variables sites",
gutter=65)
axes.label.text = prog
## x-axis
axes.x.ticks.show = True
axes.x.label.style = {"baseline-shift":"-40px", "font-size":"16px"}
axes.x.ticks.labels.style = {"baseline-shift":"-2.5px", "font-size":"12px"}
axes.x.ticks.below = 5
axes.x.ticks.above = 0
axes.x.domain.max = maxend
axes.x.ticks.locator = toyplot.locator.Explicit(
range(0, maxend, 5),
map(str, range(0, maxend, 5)))
## y-axis
axes.y.ticks.show=True
axes.y.label.style = {"baseline-shift":"40px", "font-size":"16px"}
axes.y.ticks.labels.style = {"baseline-shift":"5px", "font-size":"12px"}
axes.y.ticks.below = 0
axes.y.ticks.above = 5
## add fill plots
x = np.arange(0, maxend)
f1 = axes.fill(x, distvar, color=colormap[0], opacity=0.5, title="total variable sites")
f2 = axes.fill(x, distpis, color=colormap[1], opacity=0.5, title="parsimony informative sites")
## add a horizontal dashed line at the median Nsnps per site
axes.hlines(np.median(distvar), opacity=0.9, style={"stroke-dasharray":"4, 4"})
axes.hlines(np.median(distpis), opacity=0.9, style={"stroke-dasharray":"4, 4"})
return canvas, axes
import numpy_indexed as npi
## Get the number of samples with data at each snp
def snp_coverage(call_data):
snp_counts = collections.Counter([x.sum() for x in call_data["GT"] != "./."])
## Fill zero values
return [snp_counts[x] for x in xrange(1, np.array(snp_counts.keys()).max()+1)]
## Get the number of samples with data at each locus
def loci_coverage(var_data, call_data, assembler):
if "stacks" in assembler:
loci = zip(*npi.group_by(map(lambda x: x.split("_")[0],var_data["ID"]))(call_data["GT"] != "./."))
else:
loci = zip(*npi.group_by(var_data["CHROM"])(call_data["GT"] != "./."))
counts_per_snp = []
for z in xrange(0, len(loci)):
counts_per_snp.append([x.sum() for x in loci[z][1]])
counts = collections.Counter([np.max(x) for x in counts_per_snp])
## Fill all zero values
return [counts[x] for x in xrange(1, np.array(counts.keys()).max()+1)]
## Get total number of snps per sample
def sample_nsnps(call_data):
return [x.sum() for x in call_data["GT"].T != "./."]
## Get total number of loci per sample
def sample_nloci(var_data, call_data, assembler):
if "stacks" in assembler:
locus_groups = npi.group_by(map(lambda x: x.split("_")[0],var_data["ID"]))(call_data["GT"] != "./.")
else:
locus_groups = npi.group_by(v["CHROM"])(c["GT"] != "./.")
by_locus = [x.T for x in locus_groups[1]]
by_sample = np.array([(x).any(axis=1) for x in by_locus])
return [x.sum() for x in by_sample.T]
## Make a new pandas dataframe for holding the coverage results
sim_vcf_dict = {}
sim_vcf_dict["ipyrad-reference"] = os.path.join(IPYRAD_SIM_DIR, "refmap-sim_outfiles/refmap-sim.vcf")
sim_vcf_dict["ipyrad-denovo_plus_reference"] = os.path.join(IPYRAD_SIM_DIR, "denovo_plus_reference-sim_outfiles/denovo_plus_reference-sim.vcf")
sim_vcf_dict["ipyrad-denovo_plus_reference"] = os.path.join(IPYRAD_SIM_DIR, "denovo_minus_reference-sim_outfiles/denovo_minus_reference-sim.vcf")
sim_vcf_dict["stacks"] = os.path.join(STACKS_SIM_DIR, "batch_1.vcf")
sim_vcf_dict["ddocent"] = os.path.join(DDOCENT_SIM_DIR, "TotalRawSNPs.snps.vcf.recode.vcf")
## Make sure we have all the vcf files
for k, f in sim_vcf_dict.items():
if os.path.exists(f):
print("found - {}".format(f))
else:
print("not found - {}".format(f))
found - /home/iovercast/manuscript-analysis/REFMAP_SIM/ddocent/TotalRawSNPs.snps.vcf.recode.vcf found - /home/iovercast/manuscript-analysis/REFMAP_SIM/ipyrad/reference-assembly/denovo_minus_reference-sim_outfiles/denovo_minus_reference-sim.vcf found - /home/iovercast/manuscript-analysis/REFMAP_SIM/ipyrad/reference-assembly/refmap-sim_outfiles/refmap-sim.vcf found - /home/iovercast/manuscript-analysis/REFMAP_SIM/stacks/batch_1.vcf
Get number of loci per sample, and locus coverage for each vcf file from each assembler and each simulated dataset. This is reading in from the vcf files from each assembly method. This is nice because vcf is relatively standard and all the tools can give us a version of vcf. It's not perfect though because it doesn't include information about monomorphic sites, so it doesn't tell us the true number of loci recovered. We can get an idea of coverage and depth at snps, but to get coverage and depth stats across all loci we need to dig into the guts of the output of each method (which we'll do later).
import collections
sim_loc_cov = collections.OrderedDict()
sim_snp_cov = collections.OrderedDict()
sim_sample_nsnps = collections.OrderedDict()
sim_sample_nlocs = collections.OrderedDict()
## Try just doing them all the same
for prog, filename in sim_vcf_dict.items():
try:
print("Doing - {}".format(prog))
print(" {}".format(filename))
v = vcfnp.variants(filename, verbose=False, dtypes={"CHROM":"a24"}).view(np.recarray)
c = vcfnp.calldata_2d(filename, verbose=False).view(np.recarray)
sim_snp_cov[prog] = snp_coverage(c)
sim_sample_nsnps[prog] = sample_nsnps(c)
sim_loc_cov[prog] = loci_coverage(v, c, prog)
sim_sample_nlocs[prog] = sample_nloci(v, c, prog)
except Exception as inst:
print(inst)
Doing - ddocent /home/iovercast/manuscript-analysis/REFMAP_SIM/ddocent/TotalRawSNPs.snps.vcf.recode.vcf Doing - ipyrad-denovo_reference /home/iovercast/manuscript-analysis/REFMAP_SIM/ipyrad/reference-assembly/denovo_ref-sim_outfiles/denovo_ref-sim.vcf Doing - ipyrad-reference /home/iovercast/manuscript-analysis/REFMAP_SIM/ipyrad/reference-assembly/refmap-sim_outfiles/refmap-sim.vcf Doing - stacks /home/iovercast/manuscript-analysis/REFMAP_SIM/stacks/batch_1.vcf
This isn't very helpful, but you get an idea of what's going on. The ddocent results for locus coverage and number of loci per sample are ugly because for reference sequence mapping the ddocent output vcf doesn't record this information. It only retains CHROM and POS, CHROM being "MT" here and POS being the base position of each snp.
for statname, stat in {"sim_loc_cov":sim_loc_cov, "sim_snp_cov":sim_snp_cov,\
"sim_sample_nsnps":sim_sample_nsnps, "sim_sample_nlocs":sim_sample_nlocs}.items():
for prog in sim_vcf_dict.keys():
try:
print(prog + " " + statname + "\t"),
print(stat[prog ]),
print(np.mean(stat[prog]))
except:
print("No {} stats for {}".format(statname, prog))
print("------------------------------------------------------")
ddocent sim_loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1] 0.0833333333333 ipyrad-denovo_reference sim_loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1000] 83.3333333333 ipyrad-reference sim_loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 500] 41.6666666667 stacks sim_loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 992] 82.8333333333 ------------------------------------------------------ ddocent sim_sample_nlocs [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 1.0 ipyrad-denovo_reference sim_sample_nlocs [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000] 1000.0 ipyrad-reference sim_sample_nlocs [500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500] 500.0 stacks sim_sample_nlocs [994, 993, 994, 993, 994, 994, 994, 994, 994, 994, 994, 994] 993.833333333 ------------------------------------------------------ ddocent sim_sample_nsnps [4840, 4840, 4840, 4840, 4840, 4840, 4840, 4840, 4840, 4840, 4840, 4838] 4839.83333333 ipyrad-denovo_reference sim_sample_nsnps [9541, 9541, 9540, 9541, 9540, 9541, 9541, 9541, 9541, 9541, 9541, 9541] 9540.83333333 ipyrad-reference sim_sample_nsnps [4776, 4776, 4775, 4776, 4775, 4776, 4776, 4776, 4776, 4776, 4776, 4776] 4775.83333333 stacks sim_sample_nsnps [4648, 4642, 4648, 4643, 4647, 4647, 4648, 4647, 4647, 4648, 4648, 4646] 4646.58333333 ------------------------------------------------------ ddocent sim_snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4838] 403.333333333 ipyrad-denovo_reference sim_snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 9539] 795.083333333 ipyrad-reference sim_snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4774] 398.0 stacks sim_snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 14, 4633] 387.333333333 ------------------------------------------------------
## Load the calldata into a dict so we don't have to keep loading and reloading it
calldata = {}
for prog in sim_vcf_dict.keys():
print("{}".format(prog)),
print("{}".format(sim_vcf_dict[prog]))
c = vcfnp.calldata_2d(sim_vcf_dict[prog], verbose=False).view(np.recarray)
calldata[prog] = c
ddocent /home/iovercast/manuscript-analysis/REFMAP_SIM/ddocent/TotalRawSNPs.snps.vcf.recode.vcf ipyrad-denovo_reference /home/iovercast/manuscript-analysis/REFMAP_SIM/ipyrad/reference-assembly/denovo_ref-sim_outfiles/denovo_ref-sim.vcf ipyrad-reference /home/iovercast/manuscript-analysis/REFMAP_SIM/ipyrad/reference-assembly/refmap-sim_outfiles/refmap-sim.vcf stacks /home/iovercast/manuscript-analysis/REFMAP_SIM/stacks/batch_1.vcf
Some housekeeping with sample names to make the PCA plots prettier
pop1 = ["1A_0", "1B_0", "1C_0", "1D_0"]
pop2 = ["2E_0", "2F_0", "2G_0", "2H_0"]
pop3 = ["3I_0", "3J_0", "3K_0", "3L_0"]
sim_sample_names = pop1 + pop2 + pop3
pops = {"pop1":pop1, "pop2":pop2, "pop3":pop3}
pop_colors = {"pop1":"r", "pop2":"b", "pop3":"g"}
flt = np.in1d(np.array(sim_sample_names), pop1)
print(flt)
[ True True True True False False False False False False False False]
f, axarr = plt.subplots(2,2, figsize=(10,10), dpi=1000)
axarr = [a for b in axarr for a in b]
for prog, ax in zip(assemblers, axarr):
coords1, model1 = getPCA(calldata[prog])
x = coords1[:, 0]
y = coords1[:, 1]
ax.scatter(x, y, marker='o')
ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))
for pop in pops.keys():
flt = np.in1d(np.array(sim_sample_names), pops[pop])
ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=pop_colors[pop], label=pop, markersize=10, mec='k', mew=.5)
ax.set_title(prog, style="italic")
ax.axison = True
f.tight_layout()
f, axarr = plt.subplots(2, 2, figsize=(10, 10), dpi=1000)
axarr = [a for b in axarr for a in b]
for prog, ax in zip(assemblers, axarr):
## Calculate pairwise distances
dist = getDistances(calldata[prog])
## Doing it this way works, but allel uses imshow internally which rasterizes the image
#allel.plot.pairwise_distance(dist, labels=None, ax=ax, colorbar=False)
## Create the pcolormesh by hand
dat = ensure_square(dist)
## for some reason np.flipud(dat) is chopping off one row of data
p = ax.pcolormesh(np.arange(0,len(dat[0])), np.arange(0,len(dat[0])), dat,\
cmap="jet", vmin=np.min(dist), vmax=np.max(dist))
## Clip all heatmaps to actual sample size
p.axes.axis("tight")
ax.set_title(prog, style="italic")
ax.axison = False
What we want to know is total number of loci recovered (not just variable loci). First we'll create some dictionaries just like above, but we'll call them full to indicate that they include monomorphic sites. Because snps don't occur in monomorphic sites kind of by definition, we only really are iterested in the depth across loci and the number of loci recovered per sample.
## Blank the ordered dicts for gathering locus coverage and sample nlocs
sim_full_loc_cov = collections.OrderedDict()
sim_full_sample_nlocs = collections.OrderedDict()
assembly_methods = {"ipyrad-reference":"refmap-sim", "ipyrad-denovo_reference":"denovo_ref-sim"}
for name, method in assembly_methods.items():
print(method)
simdir = os.path.join(IPYRAD_SIM_DIR, method + "_outfiles/")
statsfile = simdir + "{}_stats.txt".format(method)
infile = open(statsfile).readlines()
sample_coverage = [int(x.strip().split()[1]) for x in infile[20:32]]
print(sample_coverage)
print("mean sample coverage - {}\t".format(np.mean(sample_coverage))),
print("min/max - {}/{}\t".format(np.min(sample_coverage), np.max(sample_coverage)))
sim_full_sample_nlocs[name] = sample_coverage
nmissing = [int(x.strip().split()[1]) for x in infile[38:50]]
sim_full_loc_cov[name] = nmissing
## Just look at the ones we care about for ipyrad
print(sim_full_loc_cov.items())
print(sim_full_sample_nlocs.items())
denovo_ref-sim [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000] mean sample coverage - 1000.0 min/max - 1000/1000 refmap-sim [500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500] mean sample coverage - 500.0 min/max - 500/500 [('ipyrad-denovo_reference', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1000]), ('ipyrad-reference', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 500])] [('ipyrad-denovo_reference', [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000]), ('ipyrad-reference', [500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500])]
method = "stacks"
simdir = STACKS_SIM_DIR
try:
lines = open("{}/batch_1.haplotypes.tsv".format(simdir)).readlines()
cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]]
sim_full_loc_cov[method] = [cnts.count(i) for i in range(1,13)]
except Exception as inst:
print("loc_cov - {} - {}".format(inst, simdir))
try:
sim_full_sample_nlocs[method] = []
samp_haps = glob.glob("{}/*matches*".format(simdir))
for f in samp_haps:
lines = gzip.open(f).readlines()
sim_full_sample_nlocs[method].append(len(lines) - 1)
except Exception as inst:
print("sample_nlocs - {} - {}".format(inst, simdir))
print(sim_full_loc_cov.items())
print(sim_full_sample_nlocs.items())
[('ipyrad-denovo_reference', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1000]), ('ipyrad-reference', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 500]), ('stacks', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 997])] [('ipyrad-denovo_reference', [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000]), ('ipyrad-reference', [500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500]), ('stacks', [1163, 1165, 1162, 1154, 1154, 1154, 1153, 1160, 1144, 1144, 1149, 1135])]
It is not straightforward how to get locus coverage and nloci per sample information from ddocent results. See the manuscript-analysis notebook for more explanation about why.
DDOCENT_DIR = "/home/iovercast/manuscript-analysis/dDocent/"
def ddocent_stats(RUNDIR):
## This is hackish. You have to dip into the bam file to get locus counts that
## include counts for monomorphics. Note! that field 3 of the bam file is not
## the sequence data, but rather the dDocent mock-contig ('dDocent_Contig_3')
## the read maps to. So really this is kind of by proxy counting the number of loci.
sample_coverage = []
for samp in glob.glob(RUNDIR + "/*-RG.bam"):
cmd = "{}samtools view {} | cut -f 4 | uniq | wc -l".format(DDOCENT_DIR, samp)
res = subprocess.check_output(cmd, shell=True)
sample_coverage.append(int(res.strip()))
print("This must be taken with a grain of salt because these aren't the actual counts you get out of the final data files.")
print("mean sample coverage - {}".format(np.mean(sample_coverage)))
print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))
cmd = "{}samtools view {}cat-RRG.bam | cut -f 3,4 | uniq".format(DDOCENT_DIR, RUNDIR)
locus_positions = subprocess.check_output(cmd, shell=True).split("\n")
locus_counter = collections.Counter() # Num loci per sample
coverage_counter = collections.Counter() # Coverage per locus
for loc in locus_positions:
if not loc:
continue
chrom = loc.split()[0]
pos = int(loc.split()[1])
cmd = "{}samtools view {}cat-RRG.bam {}:{}-{} | cut -f 1 | cut -f 3 -d \"_\" | sort | uniq".format(DDOCENT_DIR, RUNDIR, chrom, pos, pos+1)
res = subprocess.check_output(cmd, shell=True).split()
locus_counter.update(res)
coverage_counter.update([len(res)])
## Fill in zero values that aren't in the locus counter
dat = []
for i in xrange(1,13):
try:
dat.append(coverage_counter[i])
except:
dat.append(0)
return dat, locus_counter.values()
loc_cov, sample_nlocs = ddocent_stats(DDOCENT_SIM_DIR)
sim_full_loc_cov["ddocent"] = loc_cov
sim_full_sample_nlocs["ddocent"] = sample_nlocs
print(sim_full_loc_cov.items())
print(sim_full_sample_nlocs.items())
This must be taken with a grain of salt because these aren't the actual counts you get out of the final data files. mean sample coverage - 1000.66666667 min/max - 1000/1002 [('ddocent', [2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1007])] [('ddocent', [1007, 1007, 1007, 1007, 1007, 1007, 1007, 1008, 1008, 1007, 1007, 1007])]
simlevel = ["simulated"]
sim_sample_nlocs_df = pd.DataFrame(index=assemblers, columns=simlevel)
for sim in simlevel:
for ass in assemblers:
simstring = ass
sim_sample_nlocs_df[sim][ass] = np.mean(sim_full_sample_nlocs[simstring])
print("Mean number of loci recovered per sample.")
## Normalize all bins
dat = sim_sample_nlocs_df[sim_sample_nlocs_df.columns].astype(float)
for sim in simlevel:
scale = 1000
# if "ipyrad-reference" == sim
dat[sim] = dat[sim]/scale
dat[sim]["ipyrad-reference"] *= 2
sns.heatmap(dat, square=True, center=1, linewidths=2, annot=True)
print(sim_sample_nlocs_df)
Mean number of loci recovered per sample. simulated ipyrad-reference 500 ipyrad-denovo_reference 1000 stacks 1153.08 ddocent 1007.17
It just looks nicer, makes more sense.
NB: This is broken for refmap.
dfsns2 = pd.DataFrame(columns = ["assembler", "simtype", "bin", "simdata"])
for sim in simlevel:
for ass in assemblers:
simstring = ass
## Normalize values so different sim sizes print the same
max = 1000.
newdat = sim_full_loc_cov[simstring]
newdat = [sum(newdat)-sum(newdat[:i-1]) for i in range(1,13)]
for i, val in enumerate(newdat):
dfsns2.loc[simstring + "-" + str(i)] = [ass, sim, i+1, val]
g = sns.FacetGrid(dfsns2, col="simtype", hue="assembler", size=1)
g.map(plt.scatter, "bin", "simdata")
g.map(plt.plot, "bin", "simdata").add_legend()
#axs = g.axes
#for ax in axs:
# ax.set_xlim(0,12.5)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-60-630bba5c72ef> in <module>() 14 15 g = sns.FacetGrid(dfsns2, col="simtype", hue="assembler", size=1) ---> 16 g.map(plt.scatter, "bin", "simdata") 17 g.map(plt.plot, "bin", "simdata").add_legend() 18 #axs = g.axes /home/iovercast/opt/miniconda/lib/python2.7/site-packages/seaborn/axisgrid.pyc in map(self, func, *args, **kwargs) 729 730 # Finalize the annotations and layout --> 731 self._finalize_grid(args[:2]) 732 733 return self /home/iovercast/opt/miniconda/lib/python2.7/site-packages/seaborn/axisgrid.pyc in _finalize_grid(self, axlabels) 820 self.set_axis_labels(*axlabels) 821 self.set_titles() --> 822 self.fig.tight_layout() 823 824 def facet_axis(self, row_i, col_j): /home/iovercast/opt/miniconda/lib/python2.7/site-packages/matplotlib/figure.pyc in tight_layout(self, renderer, pad, h_pad, w_pad, rect) 1752 rect=rect) 1753 -> 1754 self.subplots_adjust(**kwargs) 1755 1756 /home/iovercast/opt/miniconda/lib/python2.7/site-packages/matplotlib/figure.pyc in subplots_adjust(self, *args, **kwargs) 1608 1609 """ -> 1610 self.subplotpars.update(*args, **kwargs) 1611 for ax in self.axes: 1612 if not isinstance(ax, SubplotBase): /home/iovercast/opt/miniconda/lib/python2.7/site-packages/matplotlib/figure.pyc in update(self, left, bottom, right, top, wspace, hspace) 224 if self.left >= self.right: 225 reset() --> 226 raise ValueError('left cannot be >= right') 227 228 if self.bottom >= self.top: ValueError: left cannot be >= right
This section is incomplete. See below for empirical and simulated results that are combined.
Here we'll pull together all the output vcf files. This takes a few minutes to run.
emp_vcf_dict = {}
emp_vcf_dict["ipyrad-reference"] = os.path.join(IPYRAD_REFMAP_DIR, "refmap-empirical_outfiles/refmap-empirical.vcf")
emp_vcf_dict["ipyrad-denovo_reference"] = os.path.join(IPYRAD_REFMAP_DIR, "denovo_ref-empirical_outfiles/denovo_ref-empirical.vcf")
emp_vcf_dict["stacks"] = os.path.join(STACKS_REFMAP_DIR, "batch_1.vcf")
emp_vcf_dict["ddocent"] = os.path.join(DDOCENT_REFMAP_DIR, "TotalRawSNPs.snps.vcf.recode.vcf")
emp_loc_cov = collections.OrderedDict()
emp_snp_cov = collections.OrderedDict()
emp_sample_nsnps = collections.OrderedDict()
emp_sample_nlocs = collections.OrderedDict()
## Try just doing them all the same
for prog, filename in emp_vcf_dict.items():
try:
print("Doing - {}".format(prog))
print("\t{}".format(filename))
v = vcfnp.variants(filename, dtypes={"CHROM":"a24"}).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)
emp_loc_cov[prog] = loci_coverage(v, c, prog)
emp_snp_cov[prog] = snp_coverage(c)
emp_sample_nsnps[prog] = sample_nsnps(c)
emp_sample_nlocs[prog] = sample_nloci(v, c, prog)
plotPCA(c, prog)
plotPairwiseDistance(c, prog)
except Exception as inst:
print(inst)
[vcfnp] 2016-12-29 22:07:19.464202 :: caching is disabled [vcfnp] 2016-12-29 22:07:19.464703 :: building array
Doing - ipyrad-denovo_reference /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/denovo_ref-empirical_outfiles/denovo_ref-empirical.vcf file not found: /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/denovo_ref-empirical_outfiles/denovo_ref-empirical.vcf Doing - ipyrad-reference /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/refmap-empirical_outfiles/refmap-empirical.vcf file not found: /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/refmap-empirical_outfiles/refmap-empirical.vcf Doing - stacks /home/iovercast/manuscript-analysis/Phocoena_empirical/stacks/batch_1.vcf
[vcfnp] 2016-12-29 22:07:22.543297 :: caching is disabled [vcfnp] 2016-12-29 22:07:22.543798 :: building array
global name 'species' is not defined
for i in emp_sample_nlocs:
print(i),
print(emp_sample_nlocs[i])
stacks_default [22215, 32074, 25377, 16640, 23557, 17426, 32172, 34354, 31333, 29809, 34792, 24791, 25256] ddocent_full [34816, 37668, 35523, 25583, 36635, 25751, 38051, 39179, 38946, 38744, 39209, 37612, 37240] pyrad [22826, 34866, 30179, 19710, 19076, 21977, 36830, 36774, 37212, 29720, 38012, 35350, 31155] ipyrad [18710, 28396, 24635, 15667, 15301, 17830, 29885, 29492, 30213, 24043, 30671, 29076, 25580] ddocent_filt [18640, 18904, 18745, 18471, 18713, 18553, 18969, 19050, 19096, 19068, 19086, 19182, 19164] stacks_ungapped [23103, 35494, 30792, 19377, 19547, 21469, 36663, 36949, 36703, 29789, 37968, 33544, 31931]
First we load in the variant info and call data for all the snps
IPYRAD_EMPIRICAL_OUTPUT=os.path.join(IPYRAD_DIR, "REALDATA/")
IPYRAD_STATS = os.path.join(IPYRAD_EMPIRICAL_OUTPUT, "REALDATA_outfiles/REALDATA_stats.txt")
infile = open(IPYRAD_STATS).readlines()
sample_coverage = [int(x.strip().split()[1]) for x in infile[20:33]]
print(sample_coverage)
print("mean sample coverage - {}".format(np.mean(sample_coverage)))
print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))
[21163, 31863, 27902, 20852, 17779, 23008, 33289, 33697, 35765, 29184, 35096, 38144, 34746] mean sample coverage - 29422.1538462 min/max - 17779/38144
Read the biallelic vcf
filename = os.path.join(IPYRAD_EMPIRICAL_OUTPUT, "REALDATA_outfiles/REALDATA.biallelic.vcf")
# filename = vcf_dict["ipyrad"]
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)
[vcfnp] 2016-10-12 12:42:26.099077 :: caching is disabled [vcfnp] 2016-10-12 12:42:26.099836 :: building array [vcfnp] 2016-10-12 12:42:29.441796 :: caching is disabled [vcfnp] 2016-10-12 12:42:29.442748 :: building array
Getting variable sites and parsimony informative sites from the vcf is kind of annoying because all the programs export slightly different formats, so you need to parse them in slightly different ways. There's a better way to do this for ipyrad but i figure i'll do it the same way for all of them so it's more clear what's happening.
## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["POS"]), mask=only_pis).compressed()
## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x) for x in pis])
#distpis = [x for x in sorted(counts.items())]
## Getting the distvar is easier
distvar = Counter([int(x) for x in v.POS])
#distvar = [x for x in sorted(counts.items())]
canvas, axes = SNP_position_plot(distvar, distpis)
## save fig
#toyplot.html.render(canvas, 'snp_positions.html')
canvas
STACKS_OUTPUT=os.path.join(STACKS_DIR, "REALDATA/")
STACKS_GAP_OUT=os.path.join(STACKS_OUTPUT, "gapped/")
STACKS_UNGAP_OUT=os.path.join(STACKS_OUTPUT, "ungapped/")
STACKS_DEFAULT_OUT=os.path.join(STACKS_OUTPUT, "default/")
#lines = open("SIMsmall/stackf_high/batch_1.haplotypes.tsv").readlines()
#cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]]
#shigh = [cnts.count(i) for i in range(1,13)]
filename = os.path.join(STACKS_UNGAP_OUT, "batch_1.vcf")
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)
[vcfnp] 2016-10-12 10:48:17.512074 :: caching is disabled [vcfnp] 2016-10-12 10:48:17.512879 :: building array [vcfnp] 2016-10-12 10:48:25.055213 :: caching is disabled [vcfnp] 2016-10-12 10:48:25.061273 :: building array
## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["ID"]), mask=only_pis).compressed()
## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x.split("_")[1]) for x in pis])
#distpis = [x for x in sorted(counts.items())]
## Getting the distvar is easier
distvar = Counter([int(x.split("_")[1]) for x in v.ID])
#distvar = [x for x in sorted(counts.items())]
canvas, axes = SNP_position_plot(distvar, distpis)
## save fig
#toyplot.html.render(canvas, 'snp_positions.html')
canvas
vcf_dict = {}
vcf_dict["ipyrad-reference-sim"] = os.path.join(IPYRAD_SIM_DIR, "refmap-sim_outfiles/refmap-sim.vcf")
vcf_dict["ipyrad-denovo_plus_reference-sim"] = os.path.join(IPYRAD_SIM_DIR, "denovo_plus_reference-sim_outfiles/denovo_plus_reference-sim.vcf")
vcf_dict["ipyrad-denovo_minus_reference-sim"] = os.path.join(IPYRAD_SIM_DIR, "denovo_minus_reference-sim_outfiles/denovo_minus_reference-sim.vcf")
vcf_dict["stacks-sim"] = os.path.join(STACKS_SIM_DIR, "batch_1.vcf")
vcf_dict["ddocent-tot-sim"] = os.path.join(DDOCENT_SIM_DIR, "TotalRawSNPs.snps.vcf.recode.vcf")
vcf_dict["ddocent-fin-sim"] = os.path.join(DDOCENT_SIM_DIR, "Final.recode.snps.vcf.recode.vcf")
vcf_dict["ipyrad-reference-empirical"] = os.path.join(IPYRAD_REFMAP_DIR, "reference-assembly/refmap-empirical_outfiles/refmap-empirical.vcf")
vcf_dict["ipyrad-denovo_plus_reference-empirical"] = os.path.join(IPYRAD_REFMAP_DIR, "reference-assembly/denovo_plus_reference_outfiles/denovo_ref-empirical.vcf")
vcf_dict["ipyrad-denovo_plus_reference-095-empirical"] = os.path.join(IPYRAD_REFMAP_DIR, "reference-assembly/denovo_plus_reference-095_outfiles/denovo_plus_reference-095.vcf")
vcf_dict["ipyrad-denovo_minus_reference-empirical"] = os.path.join(IPYRAD_REFMAP_DIR, "reference-assembly/denovo_minus_reference_outfiles/denovo_ref-empirical.vcf")
vcf_dict["stacks-empirical"] = os.path.join(STACKS_REFMAP_DIR, "batch_1.vcf")
vcf_dict["ddocent-fin-empirical"] = os.path.join(DDOCENT_REFMAP_DIR, "Final.recode.snps.vcf.recode.vcf")
# skipt the full ddocent vcf. It's huge and we know we don't want to use it.
#vcf_dict["ddocent-tot-empirical"] = os.path.join(DDOCENT_REFMAP_DIR, "TotalRawSNPs.snps.vcf.recode.vcf")
## Make sure we have all the vcf files
for k, f in vcf_dict.items():
if os.path.exists(f):
print("found - {}".format(f))
else:
print("not found - {}".format(f))
found - /home/iovercast/manuscript-analysis/REFMAP_SIM/ddocent/Final.recode.snps.vcf.recode.vcf found - /home/iovercast/manuscript-analysis/Phocoena_empirical/stacks/batch_1.vcf found - /home/iovercast/manuscript-analysis/REFMAP_SIM/ipyrad/reference-assembly/denovo_minus_reference-sim_outfiles/denovo_minus_reference-sim.vcf found - /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/denovo_plus_reference-095_outfiles/denovo_plus_reference-095.vcf not found - /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/denovo_plus_reference_outfiles/denovo_ref-empirical.vcf found - /home/iovercast/manuscript-analysis/REFMAP_SIM/stacks/batch_1.vcf found - /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/refmap-empirical_outfiles/refmap-empirical.vcf found - /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/denovo_minus_reference_outfiles/denovo_ref-empirical.vcf found - /home/iovercast/manuscript-analysis/Phocoena_empirical/ddocent/Final.recode.snps.vcf.recode.vcf found - /home/iovercast/manuscript-analysis/REFMAP_SIM/ddocent/TotalRawSNPs.snps.vcf.recode.vcf found - /home/iovercast/manuscript-analysis/REFMAP_SIM/ipyrad/reference-assembly/refmap-sim_outfiles/refmap-sim.vcf found - /home/iovercast/manuscript-analysis/REFMAP_SIM/ipyrad/reference-assembly/denovo_plus_reference-sim_outfiles/denovo_plus_reference-sim.vcf
There has got to be a better way to do this for the empirical at least. It seems super hax, but it works.
## sim
pop1 = ["1A_0", "1B_0", "1C_0", "1D_0"]
pop2 = ["2E_0", "2F_0", "2G_0", "2H_0"]
pop3 = ["3I_0", "3J_0", "3K_0", "3L_0"]
sim_sample_names = pop1 + pop2 + pop3
sim_pops = {"pop1":pop1, "pop2":pop2, "pop3":pop3}
sim_pop_colors = {"pop1":"r", "pop2":"b", "pop3":"g"}
## empirical
emp_pops = {}
emp_sample_names = []
popfile = os.path.join(STACKS_REFMAP_DIR, "popmap.txt")
with open(popfile) as infile:
lines = [l.strip().split() for l in infile.readlines()]
emp_sample_names = [x[0] for x in lines]
pops = set([x[1] for x in lines])
for pop in pops: emp_pops[pop] = []
for line in lines:
p = line[1]
s = line[0]
emp_pops[p].append(s)
emp_pop_colors = {k:v for (k,v) in zip(emp_pops, list(matplotlib.colors.cnames))}
## Write out the samples to pop files for vcftools
for outdir, pop_dict in {REFMAP_SIM_DIR:sim_pops, REFMAP_EMPIRICAL_DIR:emp_pops}.items():
for pop, samps in pop_dict.items():
with open(outdir + pop + ".txt", "w") as outfile:
outfile.write("\n".join(samps))
## The order of the samples in the vcf file is wonky. Reorder them so they're sorted by population. This
## is roughly the order of populations from the manuscript.
## If you run this multiple times, you should re-run the previous cell that resets
## the vcf file to the original path, otherwise you get a bunch of *.reorder.reorder.vcf files.
order = ["WBS", "IS", "NOS", "SK1", "KB1", "BES2", "IBS"]
vcf_header = """##fileformat=VCFv4.1\n##fileDate=20161230\n##source="Stacks v1.42"\n##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">\n##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">\n##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">\n##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Allele Depth">\n##FORMAT=<ID=GL,Number=.,Type=Float,Description="Genotype Likelihood">\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"""
with open("/tmp/dummy.vcf", 'w') as outfile:
outfile.write(vcf_header)
for p in order:
outfile.write("\t" + "\t".join(emp_pops[p]))
for k, f in vcf_dict.items():
if "stacks" in k:
## The version of vcftools we have here doesn't like the newer version of vcf stacks
## writes, so we have to 'fix' the stacks vcf version
print("Fixing stacks vcf file format - {}".format(k))
shutil.copy2(f, f+".bak")
with open(f) as infile:
dat = infile.readlines()[1:]
with open(f, 'w') as outfile:
outfile.write("##fileformat=VCFv4.1\n")
outfile.write("".join(dat))
if "empirical" in k:
print("reordering")
vcftools_path = os.path.join(DDOCENT_DIR, "vcftools_0.1.11/perl/")
os.chdir(vcftools_path)
tmpvcf = "tmp.vcf"
newvcf = f.rsplit(".", 1)[0] + ".reordered.vcf"
print(newvcf)
cmd = "perl vcf-shuffle-cols -t {} {} > {}".format("/tmp/dummy.vcf", f, tmpvcf)
print(cmd)
os.system(cmd)
!mv $tmpvcf $newvcf
## Don't destroy the old one
vcf_dict[k] = newvcf
Fixing stacks vcf file format - stacks-empirical reordering /home/iovercast/manuscript-analysis/Phocoena_empirical/stacks/batch_1.reordered.vcf perl vcf-shuffle-cols -t /tmp/dummy.vcf /home/iovercast/manuscript-analysis/Phocoena_empirical/stacks/batch_1.vcf > tmp.vcf reordering /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/denovo_plus_reference-095_outfiles/denovo_plus_reference-095.reordered.vcf perl vcf-shuffle-cols -t /tmp/dummy.vcf /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/denovo_plus_reference-095_outfiles/denovo_plus_reference-095.vcf > tmp.vcf reordering /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/denovo_plus_reference_outfiles/denovo_ref-empirical.reordered.vcf perl vcf-shuffle-cols -t /tmp/dummy.vcf /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/denovo_plus_reference_outfiles/denovo_ref-empirical.vcf > tmp.vcf Fixing stacks vcf file format - stacks-sim reordering /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/refmap-empirical_outfiles/refmap-empirical.reordered.vcf perl vcf-shuffle-cols -t /tmp/dummy.vcf /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/refmap-empirical_outfiles/refmap-empirical.vcf > tmp.vcf reordering /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/denovo_minus_reference_outfiles/denovo_ref-empirical.reordered.vcf perl vcf-shuffle-cols -t /tmp/dummy.vcf /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/denovo_minus_reference_outfiles/denovo_ref-empirical.vcf > tmp.vcf reordering /home/iovercast/manuscript-analysis/Phocoena_empirical/ddocent/Final.recode.snps.vcf.recode.reordered.vcf perl vcf-shuffle-cols -t /tmp/dummy.vcf /home/iovercast/manuscript-analysis/Phocoena_empirical/ddocent/Final.recode.snps.vcf.recode.vcf > tmp.vcf
import collections
## Load the calldata into a dict so we don't have to keep loading and reloading it
## This can take several minutes for the large empirical datasets (like 20+ minutes)
all_calldata = {}
all_vardata = {}
## Dicts for tracking all the stats
loc_cov = collections.OrderedDict()
snp_cov = collections.OrderedDict()
samp_nsnps = collections.OrderedDict()
samp_nlocs = collections.OrderedDict()
for prog, filename in vcf_dict.items():
try:
print("Doing - {}\n {}".format(prog, filename))
v = vcfnp.variants(filename, verbose=False, dtypes={"CHROM":"a24"}).view(np.recarray)
c = vcfnp.calldata_2d(filename, verbose=False).view(np.recarray)
all_calldata[prog] = c
all_vardata[prog] = v
snp_cov[prog] = snp_coverage(c)
samp_nsnps[prog] = sample_nsnps(c)
loc_cov[prog] = loci_coverage(v, c, prog)
samp_nlocs[prog] = sample_nloci(v, c, prog)
except Exception as inst:
print(inst)
Doing - ddocent-fin-sim /home/iovercast/manuscript-analysis/REFMAP_SIM/ddocent/Final.recode.snps.vcf.recode.vcf Doing - stacks-empirical /home/iovercast/manuscript-analysis/Phocoena_empirical/stacks/batch_1.reordered.vcf Doing - ipyrad-denovo_minus_reference-sim /home/iovercast/manuscript-analysis/REFMAP_SIM/ipyrad/reference-assembly/denovo_minus_reference-sim_outfiles/denovo_minus_reference-sim.vcf Doing - ipyrad-denovo_plus_reference-095-empirical /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/denovo_plus_reference-095_outfiles/denovo_plus_reference-095.reordered.vcf Doing - ipyrad-denovo_plus_reference-empirical /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/denovo_plus_reference_outfiles/denovo_ref-empirical.reordered.vcf basic_string::substr: __pos (which is 18446744073709551615) > this->size() (which is 0) Doing - stacks-sim /home/iovercast/manuscript-analysis/REFMAP_SIM/stacks/batch_1.vcf Doing - ipyrad-reference-empirical /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/refmap-empirical_outfiles/refmap-empirical.reordered.vcf Doing - ipyrad-denovo_minus_reference-empirical /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/denovo_minus_reference_outfiles/denovo_ref-empirical.reordered.vcf Doing - ddocent-fin-empirical /home/iovercast/manuscript-analysis/Phocoena_empirical/ddocent/Final.recode.snps.vcf.recode.reordered.vcf Doing - ddocent-tot-sim /home/iovercast/manuscript-analysis/REFMAP_SIM/ddocent/TotalRawSNPs.snps.vcf.recode.vcf Doing - ipyrad-reference-sim /home/iovercast/manuscript-analysis/REFMAP_SIM/ipyrad/reference-assembly/refmap-sim_outfiles/refmap-sim.vcf Doing - ipyrad-denovo_plus_reference-sim /home/iovercast/manuscript-analysis/REFMAP_SIM/ipyrad/reference-assembly/denovo_plus_reference-sim_outfiles/denovo_plus_reference-sim.vcf
for statname, stat in {"loc_cov":loc_cov, "snp_cov":snp_cov,\
"sample_nsnps":samp_nsnps, "sample_nlocs":samp_nlocs}.items():
for prog in vcf_dict.keys():
try:
print(prog + " " + statname + "\t"),
print(stat[prog]),
print(np.mean(stat[prog]))
except:
print("No {} stats for {}".format(statname, prog))
print("------------------------------------------------------")
ddocent-fin-sim sample_nlocs [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 1.0 stacks-empirical sample_nlocs [78511, 81962, 82414, 94151, 92146, 104316, 94099, 83252, 93109, 88319, 92008, 45195, 36564, 37502, 97311, 97110, 96071, 98059, 86883, 79414, 97308, 92421, 70530, 85021, 81766, 91929, 87529, 95557, 87105, 65463, 93372, 84925, 46207, 98960, 78322, 91547, 98216, 57176, 26729, 68932, 83942, 110102, 50223, 81625] 81438.7045455 ipyrad-denovo_minus_reference-sim sample_nlocs [500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500] 500.0 ipyrad-denovo_plus_reference-095-empirical sample_nlocs [63041, 67885, 69761, 78541, 76574, 86265, 80607, 69913, 78769, 73461, 77077, 35786, 27485, 25629, 83193, 82596, 82414, 78615, 72002, 66468, 81868, 78135, 57845, 69802, 66292, 78313, 71215, 81069, 75198, 51989, 78737, 72108, 36670, 83460, 59866, 77037, 83158, 44554, 18856, 56152, 70249, 94988, 36984, 67930] 67467.2045455 ipyrad-denovo_plus_reference-empirical sample_nlocs No sample_nlocs stats for ipyrad-denovo_plus_reference-empirical stacks-sim sample_nlocs [994, 993, 994, 993, 994, 994, 994, 994, 994, 994, 994, 994] 993.833333333 ipyrad-reference-empirical sample_nlocs [31030, 33423, 33693, 37242, 36927, 39491, 38589, 31674, 37857, 35265, 37421, 18602, 14897, 13209, 39165, 38832, 39065, 36030, 35081, 32160, 38806, 37707, 28368, 34076, 29465, 37217, 33201, 38057, 35794, 25000, 37581, 32377, 18832, 38576, 28166, 35939, 39433, 22276, 8975, 27358, 34143, 43137, 17783, 32452] 32144.8181818 ipyrad-denovo_minus_reference-empirical sample_nlocs [30955, 33398, 34961, 39454, 37735, 44195, 40145, 35820, 38694, 36374, 38090, 17081, 12767, 12827, 41997, 41755, 41141, 40125, 35241, 33158, 40890, 38561, 28434, 33972, 34554, 39038, 36141, 40999, 37785, 25871, 38854, 37800, 17141, 42406, 30533, 39180, 41602, 21621, 9625, 28101, 34350, 48947, 18864, 33920] 33752.3181818 ddocent-fin-empirical sample_nlocs [12408, 12421, 12410, 12444, 12438, 12446, 12450, 12424, 12449, 12429, 12439, 12348, 12262, 12235, 12447, 12445, 12456, 12456, 12422, 12432, 12450, 12446, 12400, 12433, 12423, 12451, 12438, 12459, 12442, 12396, 12440, 12434, 12151, 12456, 12412, 12452, 12453, 12362, 12086, 12418, 12434, 12461, 12337, 12422] 12407.2045455 ddocent-tot-sim sample_nlocs [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 1.0 ipyrad-reference-sim sample_nlocs [500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500] 500.0 ipyrad-denovo_plus_reference-sim sample_nlocs [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000] 1000.0 ------------------------------------------------------ ddocent-fin-sim sample_nsnps [4722, 4722, 4722, 4722, 4722, 4722, 4722, 4722, 4722, 4722, 4722, 4720] 4721.83333333 stacks-empirical sample_nsnps [127584, 132382, 132461, 151871, 147852, 166338, 150813, 137488, 150082, 142916, 147395, 75863, 61779, 63115, 155295, 155563, 155042, 158655, 140520, 129483, 156638, 149322, 115267, 138933, 136090, 148625, 142290, 154049, 141251, 109132, 150738, 139130, 77554, 158810, 127177, 148688, 157457, 94969, 48569, 113856, 136906, 175552, 84501, 133347] 132303.363636 ipyrad-denovo_minus_reference-sim sample_nsnps [4765, 4765, 4765, 4765, 4765, 4765, 4765, 4765, 4765, 4765, 4765, 4765] 4765.0 ipyrad-denovo_plus_reference-095-empirical sample_nsnps [137220, 146783, 149325, 169235, 165873, 185539, 173775, 153670, 170248, 159153, 166580, 77835, 59846, 56056, 179128, 177628, 177653, 170445, 156419, 143632, 176078, 168548, 125526, 151154, 146456, 169491, 154770, 174765, 162339, 114529, 170677, 156547, 79561, 180467, 131269, 167086, 178488, 97828, 42991, 122642, 152317, 203178, 81442, 147304] 146170.363636 ipyrad-denovo_plus_reference-empirical sample_nsnps No sample_nsnps stats for ipyrad-denovo_plus_reference-empirical stacks-sim sample_nsnps [4648, 4642, 4648, 4643, 4647, 4647, 4648, 4647, 4647, 4648, 4648, 4646] 4646.58333333 ipyrad-reference-empirical sample_nsnps [68134, 73041, 72841, 81281, 81035, 86323, 84391, 70165, 83389, 77755, 82062, 40749, 32574, 28963, 85446, 84542, 85338, 79213, 77265, 70319, 84859, 82520, 62281, 74722, 65372, 81397, 73122, 82985, 77986, 55170, 82551, 70629, 41240, 84238, 62253, 78628, 86080, 49126, 20083, 59985, 74760, 94244, 39378, 71100] 70443.9772727 ipyrad-denovo_minus_reference-empirical sample_nsnps [71317, 77190, 79627, 90622, 87474, 101779, 92398, 84924, 88994, 83694, 88136, 39198, 29297, 29794, 96825, 96303, 94582, 93041, 81786, 76200, 93959, 88312, 65666, 78307, 82013, 90542, 83851, 94569, 86947, 60858, 89912, 88383, 39578, 98424, 71052, 91057, 95441, 50570, 24554, 65478, 79379, 112805, 43932, 78162] 78112.0909091 ddocent-fin-empirical sample_nsnps [171685, 172201, 171522, 172909, 173155, 173637, 173266, 172177, 173278, 172838, 173170, 168267, 165354, 166545, 173392, 173405, 173517, 173424, 172597, 172531, 173419, 173298, 171246, 172721, 172256, 173361, 173086, 173363, 172955, 170887, 173079, 172319, 161239, 173552, 172418, 173079, 173517, 168600, 159573, 171618, 172605, 173755, 169918, 172199] 171657.568182 ddocent-tot-sim sample_nsnps [4840, 4840, 4840, 4840, 4840, 4840, 4840, 4840, 4840, 4840, 4840, 4838] 4839.83333333 ipyrad-reference-sim sample_nsnps [4776, 4776, 4775, 4776, 4775, 4776, 4776, 4776, 4776, 4776, 4776, 4776] 4775.83333333 ipyrad-denovo_plus_reference-sim sample_nsnps [9541, 9541, 9540, 9541, 9540, 9541, 9541, 9541, 9541, 9541, 9541, 9541] 9540.83333333 ------------------------------------------------------ ddocent-fin-sim snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4720] 393.5 stacks-empirical snp_cov [4332, 4871, 4460, 4037, 3726, 3595, 3412, 3352, 3475, 3433, 3462, 3505, 3495, 3582, 3618, 3742, 3772, 3742, 3834, 4201, 4199, 4501, 4519, 4811, 4995, 5290, 5531, 5858, 6115, 6392, 6910, 7214, 7428, 7672, 8174, 8440, 8524, 8645, 8752, 8328, 7460, 6158, 4035, 2142] 5175.88636364 ipyrad-denovo_minus_reference-sim snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4765] 397.083333333 ipyrad-denovo_plus_reference-095-empirical snp_cov [197, 351, 1457, 8374, 7055, 6189, 5742, 5437, 5295, 5155, 4906, 4728, 4597, 4706, 4725, 4665, 4807, 4678, 4897, 4976, 4995, 5352, 5479, 5441, 5918, 5902, 6178, 6198, 6658, 6864, 7191, 7438, 7973, 8232, 8723, 9050, 9462, 9790, 9673, 8969, 7716, 6075, 3534, 1374] 5843.68181818 ipyrad-denovo_plus_reference-empirical snp_cov No snp_cov stats for ipyrad-denovo_plus_reference-empirical stacks-sim snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 14, 4633] 387.333333333 ipyrad-reference-empirical snp_cov [131, 148, 758, 5298, 4204, 3597, 3113, 3337, 2944, 2915, 2616, 2586, 2573, 2579, 2504, 2615, 2632, 2652, 2655, 2737, 2724, 2895, 3033, 3009, 3184, 3147, 3405, 3384, 3497, 3763, 3788, 3886, 4078, 4220, 4124, 4407, 4529, 4176, 4180, 3670, 2849, 1863, 808, 193] 2986.5 ipyrad-denovo_minus_reference-empirical snp_cov [123, 296, 1219, 4780, 4096, 3341, 2934, 2490, 2570, 2584, 2448, 2294, 2018, 2236, 2142, 2234, 2235, 2231, 2193, 2208, 2286, 2387, 2345, 2330, 2755, 2669, 2857, 2988, 3104, 3285, 3366, 3644, 3878, 4128, 4391, 4881, 5290, 5592, 5693, 5686, 5376, 4327, 2735, 1034] 3038.61363636 ddocent-fin-empirical snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6619, 8821, 13380, 27196, 118071] 3956.52272727 ddocent-tot-sim snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4838] 403.333333333 ipyrad-reference-sim snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4774] 398.0 ipyrad-denovo_plus_reference-sim snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 9539] 795.083333333 ------------------------------------------------------ ddocent-fin-sim loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1] 0.0833333333333 stacks-empirical loc_cov [2315, 2636, 2465, 2274, 2075, 1895, 1842, 1835, 1896, 1871, 1903, 1912, 1833, 1996, 1968, 2040, 2062, 2060, 2149, 2331, 2322, 2503, 2394, 2642, 2709, 3033, 3098, 3281, 3454, 3666, 3943, 4205, 4336, 4622, 5001, 5071, 5396, 5561, 5752, 5702, 5209, 4431, 2970, 1637] 3052.18181818 ipyrad-denovo_minus_reference-sim loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 500] 41.6666666667 ipyrad-denovo_plus_reference-095-empirical loc_cov [26, 86, 513, 4305, 3689, 3128, 2931, 2738, 2654, 2551, 2453, 2388, 2287, 2257, 2260, 2314, 2344, 2214, 2260, 2357, 2362, 2413, 2516, 2512, 2585, 2750, 2794, 2838, 2975, 3183, 3205, 3354, 3574, 3690, 3900, 3962, 4286, 4480, 4409, 4175, 3624, 2844, 1671, 645] 2738.68181818 ipyrad-denovo_plus_reference-empirical loc_cov No loc_cov stats for ipyrad-denovo_plus_reference-empirical stacks-sim loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 992] 82.8333333333 ipyrad-reference-empirical loc_cov [10, 42, 264, 2029, 1776, 1563, 1407, 1471, 1252, 1349, 1262, 1278, 1203, 1212, 1215, 1226, 1239, 1218, 1219, 1290, 1257, 1300, 1390, 1359, 1438, 1371, 1505, 1582, 1534, 1668, 1684, 1750, 1799, 1893, 1874, 1946, 2024, 1924, 1935, 1716, 1361, 906, 415, 99] 1346.70454545 ipyrad-denovo_minus_reference-empirical loc_cov [9, 37, 250, 1530, 1381, 1253, 1191, 1003, 1011, 1063, 1032, 958, 893, 927, 889, 933, 969, 876, 880, 896, 952, 1012, 1023, 999, 1132, 1134, 1185, 1233, 1276, 1404, 1496, 1518, 1673, 1773, 1836, 2024, 2296, 2488, 2531, 2558, 2436, 1995, 1302, 485] 1266.86363636 ddocent-fin-empirical loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 171, 240, 360, 826, 10870] 283.340909091 ddocent-tot-sim loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1] 0.0833333333333 ipyrad-reference-sim loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 500] 41.6666666667 ipyrad-denovo_plus_reference-sim loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1000] 83.3333333333 ------------------------------------------------------
f, axarr = plt.subplots(2, 5, figsize=(20,8), dpi=1000)
## Make a list out of the axes
axarr = [a for b in axarr for a in b]
## Set them in order so the plot looks nice.
progs = ["ipyrad-reference-sim", "ipyrad-denovo_plus_reference-sim", "ipyrad-denovo_minus_reference-sim", "stacks-sim", "ddocent-fin-sim",\
"ipyrad-reference-empirical", "ipyrad-denovo_plus_reference-095-empirical", "ipyrad-denovo_minus_reference-empirical", "stacks-empirical", "ddocent-fin-empirical"]
for prog, ax in zip(progs, axarr):
print(prog, ax)
if "empirical" in prog:
pop_colors = emp_pop_colors
pops = emp_pops
sample_names = emp_sample_names
else:
pop_colors = sim_pop_colors
pops = sim_pops
sample_names = sim_sample_names
## Don't die if some of the runs aren't complete
try:
coords1, model1 = getPCA(all_calldata[prog])
except:
continue
x = coords1[:, 0]
y = coords1[:, 1]
ax.scatter(x, y, marker='o')
ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))
for pop in pops.keys():
flt = np.in1d(np.array(sample_names), pops[pop])
ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=pop_colors[pop], label=pop, markersize=10, mec='k', mew=.5)
ax.set_title(prog, style="italic")
ax.axison = True
axarr[0].legend(frameon=True)
axarr[5].legend(loc='lower left', frameon=True)
f.tight_layout()
('ipyrad-reference-sim', <matplotlib.axes._subplots.AxesSubplot object at 0x2aabaa170410>) ('ipyrad-denovo_plus_reference-sim', <matplotlib.axes._subplots.AxesSubplot object at 0x2aaba9bf26d0>) ('ipyrad-denovo_minus_reference-sim', <matplotlib.axes._subplots.AxesSubplot object at 0x2aaba9dca250>) ('stacks-sim', <matplotlib.axes._subplots.AxesSubplot object at 0x2aabaa1c25d0>) ('ddocent-fin-sim', <matplotlib.axes._subplots.AxesSubplot object at 0x2aabaa1b8550>) ('ipyrad-reference-empirical', <matplotlib.axes._subplots.AxesSubplot object at 0x2aabaa17b690>) ('ipyrad-denovo_plus_reference-095-empirical', <matplotlib.axes._subplots.AxesSubplot object at 0x2aaba9bc8c90>) ('ipyrad-denovo_minus_reference-empirical', <matplotlib.axes._subplots.AxesSubplot object at 0x2aaba9d7ab90>) ('stacks-empirical', <matplotlib.axes._subplots.AxesSubplot object at 0x2aabf81e1850>) ('ddocent-fin-empirical', <matplotlib.axes._subplots.AxesSubplot object at 0x2aabf811c350>)
f, axarr = plt.subplots(2, 5, figsize=(20,8), dpi=1000)
axarr = [a for b in axarr for a in b]
## Set them in order so the plot looks nice.
progs = ["ipyrad-reference-sim", "ipyrad-denovo_plus_reference-sim", "ipyrad-denovo_minus_reference-sim", "stacks-sim", "ddocent-fin-sim",\
"ipyrad-reference-empirical", "ipyrad-denovo_plus_reference-095-empirical", "ipyrad-denovo_minus_reference-empirical", "stacks-empirical", "ddocent-fin-empirical"]
for prog, ax in zip(progs, axarr):
print(prog, ax)
try:
## Calculate pairwise distances
dist = getDistances(all_calldata[prog])
except:
continue
## Create the pcolormesh by hand
dat = ensure_square(dist)
## for some reason np.flipud(dat) is chopping off one row of data
p = ax.pcolormesh(np.arange(0,len(dat[0])), np.arange(0,len(dat[0])), dat,\
cmap="jet", vmin=np.min(dist), vmax=np.max(dist))
## Clip all heatmaps to actual sample size
p.axes.axis("tight")
ax.set_title(prog, style="italic")
ax.axison = False
## Adjust margins to make room for the colorbar
plt.subplots_adjust(left=0.05, bottom=0.1, right=0.8, top=0.9, wspace=0.2, hspace=0.3)
## Add the colorbar
cax = f.add_axes([0.87, 0.4, 0.03, 0.5])
cb1 = matplotlib.colorbar.ColorbarBase(cax, cmap="jet", orientation="vertical", ticks=([0,1]))
cb1.ax.set_yticklabels(['Similar', "Dissimilar"], weight="bold", rotation="vertical")
('ipyrad-reference-sim', <matplotlib.axes._subplots.AxesSubplot object at 0x2aaba0a6bc50>) ('ipyrad-denovo_plus_reference-sim', <matplotlib.axes._subplots.AxesSubplot object at 0x2aaba9ca01d0>) ('ipyrad-denovo_minus_reference-sim', <matplotlib.axes._subplots.AxesSubplot object at 0x2aabaa1e0210>) ('stacks-sim', <matplotlib.axes._subplots.AxesSubplot object at 0x2aabaa00e050>) ('ddocent-fin-sim', <matplotlib.axes._subplots.AxesSubplot object at 0x2aabf90f04d0>) ('ipyrad-reference-empirical', <matplotlib.axes._subplots.AxesSubplot object at 0x2aabf9498d50>) ('ipyrad-denovo_plus_reference-095-empirical', <matplotlib.axes._subplots.AxesSubplot object at 0x2aabf964d210>) ('ipyrad-denovo_minus_reference-empirical', <matplotlib.axes._subplots.AxesSubplot object at 0x2aabf95a6090>) ('stacks-empirical', <matplotlib.axes._subplots.AxesSubplot object at 0x2aabf91a7d10>) ('ddocent-fin-empirical', <matplotlib.axes._subplots.AxesSubplot object at 0x2aabf94c00d0>)
[<matplotlib.text.Text at 0x2aabf9202750>, <matplotlib.text.Text at 0x2aabf9228750>]
For ipyrad and stacks this is very quick, but the whole step is slow because ddocent empirical stats take forever to obtain.
## Blank the ordered dicts for gathering locus coverage and sample nlocs
all_full_loc_cov = collections.OrderedDict()
all_full_sample_nlocs = collections.OrderedDict()
## Mapping between 'pretty names' and directory names. This is only really useful for ipyrad
## where i named the directories a little silly. For stacks and ddocent this doesn't really do anything.
assembly_methods = {"ipyrad-reference-sim":"refmap-sim", "ipyrad-denovo_reference-sim":"denovo_ref-sim",\
"stacks-sim":"stacks-sim", "ipyrad-reference-empirical":"refmap-empirical",\
"stacks-empirical":"stacks-empirical", "ddocent-simulated":"ddocent-simulated",\
"ddocent-empirical":"ddocent-empirical",\
"ipyrad_denovo_minus_reference-sim":"denovo_minus_reference-sim",\
"ipyrad_denovo_plus_reference-sim":"denovo_plus_reference-sim",\
"ipyrad_denovo_minus_reference-empirical":"denovo_minus_reference",\
"ipyrad_denovo_plus_reference-empirical":"denovo_plus_reference-095"
}
for name, method in assembly_methods.items():
print("Doing - {}".format(name))
if "ipyrad" in name:
outdir = IPYRAD_SIM_DIR
firstsamp = 20
lastsamp = 32
if "empirical" in name:
outdir = IPYRAD_REFMAP_DIR + "reference-assembly/"
firstsamp = 20 #fixme
lastsamp = 64 # fixme
try:
nsamps = lastsamp - firstsamp
simdir = os.path.join(outdir, method + "_outfiles/")
statsfile = simdir + "{}_stats.txt".format(method)
infile = open(statsfile).readlines()
sample_coverage = [int(x.strip().split()[1]) for x in infile[firstsamp:lastsamp]]
print("mean sample coverage - {}\t".format(np.mean(sample_coverage))),
print("min/max - {}/{}\t".format(np.min(sample_coverage), np.max(sample_coverage)))
all_full_sample_nlocs[name] = sample_coverage
nmissing = [int(x.strip().split()[1]) for x in infile[lastsamp+6:lastsamp+6+nsamps]]
all_full_loc_cov[name] = nmissing
except Exception as inst:
print(inst)
elif "stacks" in name:
outdir = STACKS_SIM_DIR
nsamps = 12
if "empirical" in name:
outdir = STACKS_REFMAP_DIR
nsamps = 44
try:
## Effectively the same as thisjjjj
## cut -f 2 batch_1.haplotypes.tsv | sort -n | uniq -c | less
lines = open("{}/batch_1.haplotypes.tsv".format(outdir)).readlines()
cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]]
all_full_loc_cov[method] = [cnts.count(i) for i in range(1,nsamps+1)]
except Exception as inst:
print("loc_cov - {} - {}".format(inst, simdir))
try:
all_full_sample_nlocs[method] = []
## This is actually number of loci
## cut -f 3 *matches | sort -n | uniq | wc -l
## Right now this is actually counting number of snps
samp_haps = glob.glob("{}/*matches*".format(outdir))
for f in samp_haps:
lines = gzip.open(f).readlines()
all_full_sample_nlocs[method].append(len(lines) - 1)
except Exception as inst:
print("sample_nlocs - {} - {}".format(inst, outdir))
# elif "ddocent" in name:
# outdir = DDOCENT_SIM_DIR
# if "empirical" in name:
# outdir = DDOCENT_REFMAP_DIR
# loc_cov, sample_nlocs = ddocent_stats(outdir)
# all_full_loc_cov[method] = loc_cov
# all_full_sample_nlocs[method] = sample_nlocs
else:
print("wtf?")
try:
print(all_full_loc_cov[name])
print(all_full_sample_nlocs[name])
except:
print("no data for - {}".format(name))
pass
print("Done - {}\n\n".format(name))
print(all_full_loc_cov.items())
print(all_full_sample_nlocs.items())
Doing - stacks-empirical [44367, 19212, 12649, 9571, 7961, 6694, 5997, 5625, 5210, 5136, 4765, 4653, 4400, 4499, 4465, 4347, 4215, 4264, 4418, 4501, 4596, 4603, 4712, 4891, 5055, 5402, 5737, 6002, 6479, 6757, 7147, 7807, 8294, 9141, 9957, 10916, 11935, 12668, 14520, 15065, 15006, 13576, 9744, 6215] [321973, 346480, 312450, 353761, 406143, 333570, 294514, 327705, 342850, 335347, 351034, 330976, 279661, 345149, 331976, 216198, 307288, 327835, 312977, 364989, 231650, 360533, 344603, 288889, 374219, 339563, 251650, 295388, 427019, 315171, 353471, 336082, 343184, 208747, 331309, 352628, 363792, 180628, 318019, 255977, 362677, 310417, 390315, 238011] Done - stacks-empirical Doing - stacks-sim [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 997] [1163, 1165, 1162, 1154, 1154, 1154, 1153, 1160, 1144, 1144, 1149, 1135] Done - stacks-sim Doing - ddocent-simulated wtf? no data for - ddocent-simulated Done - ddocent-simulated Doing - ipyrad_denovo_plus_reference-sim mean sample coverage - 1000.0 min/max - 1000/1000 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1000] [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000] Done - ipyrad_denovo_plus_reference-sim Doing - ipyrad-reference-empirical mean sample coverage - 42086.5454545 min/max - 11632/57548 [0, 0, 0, 4583, 3594, 2891, 2498, 2377, 2121, 1997, 1997, 1854, 1740, 1743, 1669, 1665, 1673, 1708, 1658, 1692, 1625, 1647, 1806, 1777, 1845, 1757, 1873, 2008, 1958, 2067, 2067, 2188, 2241, 2395, 2309, 2420, 2505, 2453, 2485, 2302, 1762, 1176, 550, 128] [48906, 44535, 19258, 24128, 48757, 47354, 51413, 51102, 17452, 51418, 49376, 36918, 43844, 45688, 41963, 51154, 44456, 50704, 47028, 49916, 49499, 32401, 38001, 40358, 43418, 48760, 24686, 42630, 44362, 57548, 11632, 35946, 51905, 28789, 50550, 47005, 22935, 42131, 36510, 48135, 52287, 46016, 41187, 49747] Done - ipyrad-reference-empirical Doing - ipyrad_denovo_minus_reference-empirical mean sample coverage - 44729.1363636 min/max - 12169/66795 [0, 0, 0, 3475, 2742, 2341, 2149, 1850, 1751, 1620, 1597, 1474, 1368, 1353, 1315, 1345, 1345, 1254, 1259, 1262, 1277, 1347, 1365, 1349, 1428, 1468, 1524, 1579, 1648, 1756, 1878, 1901, 2113, 2292, 2363, 2595, 2882, 3220, 3252, 3325, 3122, 2660, 1770, 668] [52585, 46861, 16552, 22199, 50332, 53355, 55004, 55919, 16871, 56252, 51256, 37338, 44169, 46266, 43945, 54736, 44832, 56661, 50327, 54647, 51706, 33459, 45152, 40692, 47577, 51807, 22323, 50525, 45469, 66795, 12169, 37028, 55712, 27879, 53469, 51871, 24418, 44710, 39742, 49729, 59472, 48047, 46877, 51347] Done - ipyrad_denovo_minus_reference-empirical Doing - ipyrad_denovo_minus_reference-sim mean sample coverage - 500.0 min/max - 500/500 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 500] [500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500] Done - ipyrad_denovo_minus_reference-sim Doing - ipyrad-reference-sim mean sample coverage - 500.0 min/max - 500/500 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 500] [500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500] Done - ipyrad-reference-sim Doing - ipyrad-denovo_reference-sim mean sample coverage - 1000.0 min/max - 1000/1000 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1000] [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000] Done - ipyrad-denovo_reference-sim Doing - ddocent-empirical wtf? no data for - ddocent-empirical Done - ddocent-empirical Doing - ipyrad_denovo_plus_reference-empirical mean sample coverage - 89181.8863636 min/max - 24088/128700 [0, 0, 0, 9569, 7261, 5876, 5220, 4712, 4302, 4016, 3829, 3580, 3355, 3223, 3232, 3234, 3272, 3094, 3073, 3174, 3115, 3149, 3328, 3290, 3360, 3502, 3664, 3594, 3782, 3934, 4103, 4218, 4491, 4726, 4895, 5055, 5399, 5720, 5668, 5479, 4654, 3705, 2230, 848] [104327, 93135, 35643, 46544, 101562, 104417, 109738, 110064, 33754, 110746, 103584, 75765, 89784, 94538, 87661, 109230, 91956, 111073, 99810, 107757, 104609, 67453, 86352, 82809, 93855, 103568, 48108, 95916, 92502, 128700, 24088, 73980, 110872, 57707, 106908, 101847, 47662, 89130, 77879, 100780, 115635, 96817, 91406, 104332] Done - ipyrad_denovo_plus_reference-empirical [('stacks-empirical', [44367, 19212, 12649, 9571, 7961, 6694, 5997, 5625, 5210, 5136, 4765, 4653, 4400, 4499, 4465, 4347, 4215, 4264, 4418, 4501, 4596, 4603, 4712, 4891, 5055, 5402, 5737, 6002, 6479, 6757, 7147, 7807, 8294, 9141, 9957, 10916, 11935, 12668, 14520, 15065, 15006, 13576, 9744, 6215]), ('stacks-sim', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 997]), ('ipyrad_denovo_plus_reference-sim', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1000]), ('ipyrad-reference-empirical', [0, 0, 0, 4583, 3594, 2891, 2498, 2377, 2121, 1997, 1997, 1854, 1740, 1743, 1669, 1665, 1673, 1708, 1658, 1692, 1625, 1647, 1806, 1777, 1845, 1757, 1873, 2008, 1958, 2067, 2067, 2188, 2241, 2395, 2309, 2420, 2505, 2453, 2485, 2302, 1762, 1176, 550, 128]), ('ipyrad_denovo_minus_reference-empirical', [0, 0, 0, 3475, 2742, 2341, 2149, 1850, 1751, 1620, 1597, 1474, 1368, 1353, 1315, 1345, 1345, 1254, 1259, 1262, 1277, 1347, 1365, 1349, 1428, 1468, 1524, 1579, 1648, 1756, 1878, 1901, 2113, 2292, 2363, 2595, 2882, 3220, 3252, 3325, 3122, 2660, 1770, 668]), ('ipyrad_denovo_minus_reference-sim', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 500]), ('ipyrad-reference-sim', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 500]), ('ipyrad-denovo_reference-sim', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1000]), ('ipyrad_denovo_plus_reference-empirical', [0, 0, 0, 9569, 7261, 5876, 5220, 4712, 4302, 4016, 3829, 3580, 3355, 3223, 3232, 3234, 3272, 3094, 3073, 3174, 3115, 3149, 3328, 3290, 3360, 3502, 3664, 3594, 3782, 3934, 4103, 4218, 4491, 4726, 4895, 5055, 5399, 5720, 5668, 5479, 4654, 3705, 2230, 848])] [('stacks-empirical', [321973, 346480, 312450, 353761, 406143, 333570, 294514, 327705, 342850, 335347, 351034, 330976, 279661, 345149, 331976, 216198, 307288, 327835, 312977, 364989, 231650, 360533, 344603, 288889, 374219, 339563, 251650, 295388, 427019, 315171, 353471, 336082, 343184, 208747, 331309, 352628, 363792, 180628, 318019, 255977, 362677, 310417, 390315, 238011]), ('stacks-sim', [1163, 1165, 1162, 1154, 1154, 1154, 1153, 1160, 1144, 1144, 1149, 1135]), ('ipyrad_denovo_plus_reference-sim', [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000]), ('ipyrad-reference-empirical', [48906, 44535, 19258, 24128, 48757, 47354, 51413, 51102, 17452, 51418, 49376, 36918, 43844, 45688, 41963, 51154, 44456, 50704, 47028, 49916, 49499, 32401, 38001, 40358, 43418, 48760, 24686, 42630, 44362, 57548, 11632, 35946, 51905, 28789, 50550, 47005, 22935, 42131, 36510, 48135, 52287, 46016, 41187, 49747]), ('ipyrad_denovo_minus_reference-empirical', [52585, 46861, 16552, 22199, 50332, 53355, 55004, 55919, 16871, 56252, 51256, 37338, 44169, 46266, 43945, 54736, 44832, 56661, 50327, 54647, 51706, 33459, 45152, 40692, 47577, 51807, 22323, 50525, 45469, 66795, 12169, 37028, 55712, 27879, 53469, 51871, 24418, 44710, 39742, 49729, 59472, 48047, 46877, 51347]), ('ipyrad_denovo_minus_reference-sim', [500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500]), ('ipyrad-reference-sim', [500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500]), ('ipyrad-denovo_reference-sim', [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000]), ('ipyrad_denovo_plus_reference-empirical', [104327, 93135, 35643, 46544, 101562, 104417, 109738, 110064, 33754, 110746, 103584, 75765, 89784, 94538, 87661, 109230, 91956, 111073, 99810, 107757, 104609, 67453, 86352, 82809, 93855, 103568, 48108, 95916, 92502, 128700, 24088, 73980, 110872, 57707, 106908, 101847, 47662, 89130, 77879, 100780, 115635, 96817, 91406, 104332])]
Using vcftools just to get the mean fst between each population pair.
import itertools
## dict for storing fst values. This is a dict of dictionaries. The top level is for
## each assembly type and within each assembly type is a dict of pop pairs and the fst between them.
fsts = {}
## Get pairwise fst for each pop for sim and empirical
d = {REFMAP_SIM_DIR:sim_pops, REFMAP_EMPIRICAL_DIR:emp_pops}
for k, v in vcf_dict.items():
if not os.path.exists(v):
continue
if "sim" in k:
indir = REFMAP_SIM_DIR
pop_dict = sim_pops
else:
indir = REFMAP_EMPIRICAL_DIR
pop_dict = emp_pops
os.chdir(indir)
try:
print("Doing - {}".format(k))
fsts[k] = {}
combinations = list(itertools.combinations(pop_dict, 2))
for pair in combinations:
pop1 = indir + pair[0] + ".txt"
pop2 = indir + pair[1] + ".txt"
vcftools_cmd = DDOCENT_DIR + "vcftools"
cmd = "{} --vcf {} --weir-fst-pop {} --weir-fst-pop {} --out {}".format(vcftools_cmd, v, pop1, pop2, k)
ret = subprocess.check_output(cmd, shell=True)
ret = ret.split("\n")
## Get the mean fst from the output
fst = [x for x in ret if "Weir and Cockerham mean" in x][0].split(": ")[1]
weighted_fst = [x for x in ret if "Weir and Cockerham weighted" in x][0].split(": ")[1]
print("-".join(pair), "{}/{}".format(fst, weighted_fst)),
fsts[k]["-".join(pair)] = "{}/{}".format(fst, weighted_fst)
print("\n")
except Exception as inst:
print("Failed - {} - {}".format(k, inst))
## Allow it to fail if the vcf doesn't exist.
pass
Doing - ddocent-fin-sim ('pop3-pop2', '0.17486/0.4497') ('pop3-pop1', '0.17204/0.45056') ('pop2-pop1', '0.12088/0.33013') Doing - stacks-empirical ('KB1-SK1', '-0.01716/0.0033872') ('KB1-WBS', '0.076575/0.19859') ('KB1-IS', '-0.018957/0.01089') ('KB1-BES2', '-0.012275/0.0067543') ('KB1-IBS', '-0.014075/0.0045256') ('KB1-NOS', '-0.010345/0.014532') ('SK1-WBS', '0.081251/0.19717') ('SK1-IS', '-0.02681/0.0029578') ('SK1-BES2', '-0.0074286/0.013508') ('SK1-IBS', '-0.012173/0.008697') ('SK1-NOS', '-0.018733/0.0054864') ('WBS-IS', '0.11216/0.21698') ('WBS-BES2', '0.059879/0.19404') ('WBS-IBS', '0.059164/0.18995') ('WBS-NOS', '0.086069/0.20544') ('IS-BES2', '-0.0093259/0.026361') ('IS-IBS', '-0.016882/0.01746') ('IS-NOS', '-0.02235/0.0066823') ('BES2-IBS', '-0.0087638/0.0076244') ('BES2-NOS', '-0.0032056/0.024965') ('IBS-NOS', '-0.0087328/0.018391') Doing - ipyrad-denovo_minus_reference-sim ('pop3-pop2', '0.1668/0.4316') ('pop3-pop1', '0.17018/0.44182') ('pop2-pop1', '0.12268/0.32948') Doing - ipyrad-denovo_plus_reference-095-empirical ('KB1-SK1', '-0.023798/0.003303') ('KB1-WBS', '0.077415/0.23887') ('KB1-IS', '-0.025438/0.013112') ('KB1-BES2', '-0.01869/0.007567') ('KB1-IBS', '-0.019823/0.0054943') ('KB1-NOS', '-0.015872/0.017019') ('SK1-WBS', '0.083859/0.23675') ('SK1-IS', '-0.03404/0.0021198') ('SK1-BES2', '-0.013266/0.017269') ('SK1-IBS', '-0.017577/0.010677') ('SK1-NOS', '-0.025239/0.0049364') ('WBS-IS', '0.12264/0.26421') ('WBS-BES2', '0.05149/0.22469') ('WBS-IBS', '0.055395/0.22431') ('WBS-NOS', '0.089508/0.24784') ('IS-BES2', '-0.018112/0.029512') ('IS-IBS', '-0.023608/0.020038') ('IS-NOS', '-0.027994/0.0095543') ('BES2-IBS', '-0.014136/0.0096868') ('BES2-NOS', '-0.0078227/0.030793') ('IBS-NOS', '-0.01415/0.020284') Doing - ipyrad-denovo_plus_reference-empirical Failed - ipyrad-denovo_plus_reference-empirical - list index out of range Doing - stacks-sim ('pop3-pop2', '0.17523/0.45058') ('pop3-pop1', '0.17249/0.45133') ('pop2-pop1', '0.12079/0.33051') Doing - ipyrad-reference-empirical ('KB1-SK1', '-0.024338/0.0068613') ('KB1-WBS', '0.061695/0.21921') ('KB1-IS', '-0.029979/0.014001') ('KB1-BES2', '-0.022308/0.0044536') ('KB1-IBS', '-0.023808/0.0026994') ('KB1-NOS', '-0.015705/0.02254') ('SK1-WBS', '0.072674/0.22078') ('SK1-IS', '-0.03403/0.0091718') ('SK1-BES2', '-0.018931/0.01958') ('SK1-IBS', '-0.022596/0.012753') ('SK1-NOS', '-0.025798/0.010899') ('WBS-IS', '0.10708/0.24609') ('WBS-BES2', '0.040545/0.21383') ('WBS-IBS', '0.040374/0.20428') ('WBS-NOS', '0.074959/0.23353') ('IS-BES2', '-0.024111/0.031663') ('IS-IBS', '-0.031303/0.017557') ('IS-NOS', '-0.032434/0.0094771') ('BES2-IBS', '-0.017717/0.011511') ('BES2-NOS', '-0.011882/0.031635') ('IBS-NOS', '-0.017731/0.020207') Doing - ipyrad-denovo_minus_reference-empirical ('KB1-SK1', '-0.024644/0.0036036') ('KB1-WBS', '0.089713/0.25488') ('KB1-IS', '-0.023546/0.014123') ('KB1-BES2', '-0.022548/0.0026067') ('KB1-IBS', '-0.022381/0.0043561') ('KB1-NOS', '-0.017197/0.014578') ('SK1-WBS', '0.088756/0.24235') ('SK1-IS', '-0.033035/0.0047607') ('SK1-BES2', '-0.01184/0.019395') ('SK1-IBS', '-0.015345/0.014307') ('SK1-NOS', '-0.025557/0.0073795') ('WBS-IS', '0.12954/0.27219') ('WBS-BES2', '0.056838/0.22986') ('WBS-IBS', '0.06109/0.23065') ('WBS-NOS', '0.099081/0.25648') ('IS-BES2', '-0.016472/0.032934') ('IS-IBS', '-0.021272/0.026445') ('IS-NOS', '-0.027781/0.0079985') ('BES2-IBS', '-0.015618/0.010091') ('BES2-NOS', '-0.010779/0.027077') ('IBS-NOS', '-0.014946/0.021999') Doing - ddocent-fin-empirical ('KB1-SK1', '-0.001962/0.0029912') ('KB1-WBS', '0.085547/0.19715') ('KB1-IS', '-0.0017929/0.0090684') ('KB1-BES2', '0.00027415/0.0029637') ('KB1-IBS', '5.1773e-05/0.0033916') ('KB1-NOS', '0.0038877/0.011107') ('SK1-WBS', '0.089487/0.19613') ('SK1-IS', '-0.0076227/0.0021813') ('SK1-BES2', '0.0073208/0.01205') ('SK1-IBS', '0.0026361/0.0062348') ('SK1-NOS', '-0.00049069/0.0053676') ('WBS-IS', '0.12857/0.2216') ('WBS-BES2', '0.073132/0.19152') ('WBS-IBS', '0.067364/0.18469') ('WBS-NOS', '0.083301/0.19202') ('IS-BES2', '0.0085377/0.019996') ('IS-IBS', '0.0012964/0.011622') ('IS-NOS', '-0.0037559/0.0062206') ('BES2-IBS', '0.0014986/0.0040149') ('BES2-NOS', '0.011913/0.019716') ('IBS-NOS', '0.0082616/0.015119') Doing - ddocent-tot-sim ('pop3-pop2', '0.17516/0.45038') ('pop3-pop1', '0.17211/0.45061') ('pop2-pop1', '0.12077/0.3301') Doing - ipyrad-reference-sim ('pop3-pop2', '0.17509/0.4503') ('pop3-pop1', '0.17189/0.45029') ('pop2-pop1', '0.12032/0.32919') Doing - ipyrad-denovo_plus_reference-sim ('pop3-pop2', '0.17095/0.44106') ('pop3-pop1', '0.17102/0.44598') ('pop2-pop1', '0.12149/0.32933')
from IPython.display import display
import pandas as pd
## Organize and pretty print the fsts dict
df_sim = pd.DataFrame(index=sim_pops.keys(), columns=sim_pops.keys(), dtype=str).fillna("")
df_emp = pd.DataFrame(index=emp_pops.keys(), columns=emp_pops.keys(), dtype=str).fillna("")
for assembler, all_data in fsts.items():
print("{}".format(assembler))
if "sim" in assembler:
indir = REFMAP_SIM_DIR
pop_dict = sim_pops
df = df_sim
else:
indir = REFMAP_EMPIRICAL_DIR
pop_dict = emp_pops
df = df_emp
## Init the diagonal
for p in pop_dict.keys():
df[p][p] = ""
for p, fst_data in all_data.items():
#print(df)
p1 = p.split("-")[0]
p2 = p.split("-")[1]
df[p1][p2] += " "+fst_data.split("/")[0][:6]
df[p2][p1] += " "+fst_data.split("/")[1][:6]
pd.set_option('display.max_colwidth',800)
pd.set_option('display.width',80)
df_sim.style.set_properties(**{'max-width':'10px'})
display(df_sim)
display(df_emp)
ddocent-fin-sim stacks-empirical ipyrad-reference-sim ipyrad-denovo_minus_reference-sim ipyrad-denovo_plus_reference-095-empirical ipyrad-denovo_plus_reference-empirical ipyrad-reference-empirical ipyrad-denovo_minus_reference-empirical ddocent-fin-empirical ddocent-tot-sim stacks-sim ipyrad-denovo_plus_reference-sim
pop3 | pop2 | pop1 | |
---|---|---|---|
pop3 | 0.4497 0.4503 0.4316 0.4503 0.4505 0.4410 | 0.4505 0.4502 0.4418 0.4506 0.4513 0.4459 | |
pop2 | 0.1748 0.1750 0.1668 0.1751 0.1752 0.1709 | 0.3301 0.3291 0.3294 0.3301 0.3305 0.3293 | |
pop1 | 0.1720 0.1718 0.1701 0.1721 0.1724 0.1710 | 0.1208 0.1203 0.1226 0.1207 0.1207 0.1214 |
KB1 | SK1 | WBS | IS | BES2 | IBS | NOS | |
---|---|---|---|---|---|---|---|
KB1 | 0.0033 0.0033 0.0068 0.0036 0.0029 | 0.1985 0.2388 0.2192 0.2548 0.1971 | 0.0108 0.0131 0.0140 0.0141 0.0090 | 0.0067 0.0075 0.0044 0.0026 0.0029 | 0.0045 0.0054 0.0026 0.0043 0.0033 | 0.0145 0.0170 0.0225 0.0145 0.0111 | |
SK1 | -0.017 -0.023 -0.024 -0.024 -0.001 | 0.1971 0.2367 0.2207 0.2423 0.1961 | 0.0029 0.0021 0.0091 0.0047 0.0021 | 0.0135 0.0172 0.0195 0.0193 0.0120 | 0.0086 0.0106 0.0127 0.0143 0.0062 | 0.0054 0.0049 0.0108 0.0073 0.0053 | |
WBS | 0.0765 0.0774 0.0616 0.0897 0.0855 | 0.0812 0.0838 0.0726 0.0887 0.0894 | 0.2169 0.2642 0.2460 0.2721 0.2216 | 0.1940 0.2246 0.2138 0.2298 0.1915 | 0.1899 0.2243 0.2042 0.2306 0.1846 | 0.2054 0.2478 0.2335 0.2564 0.1920 | |
IS | -0.018 -0.025 -0.029 -0.023 -0.001 | -0.026 -0.034 -0.034 -0.033 -0.007 | 0.1121 0.1226 0.1070 0.1295 0.1285 | 0.0263 0.0295 0.0316 0.0329 0.0199 | 0.0174 0.0200 0.0175 0.0264 0.0116 | 0.0066 0.0095 0.0094 0.0079 0.0062 | |
BES2 | -0.012 -0.018 -0.022 -0.022 0.0002 | -0.007 -0.013 -0.018 -0.011 0.0073 | 0.0598 0.0514 0.0405 0.0568 0.0731 | -0.009 -0.018 -0.024 -0.016 0.0085 | 0.0076 0.0096 0.0115 0.0100 0.0040 | 0.0249 0.0307 0.0316 0.0270 0.0197 | |
IBS | -0.014 -0.019 -0.023 -0.022 5.1773 | -0.012 -0.017 -0.022 -0.015 0.0026 | 0.0591 0.0553 0.0403 0.0610 0.0673 | -0.016 -0.023 -0.031 -0.021 0.0012 | -0.008 -0.014 -0.017 -0.015 0.0014 | 0.0183 0.0202 0.0202 0.0219 0.0151 | |
NOS | -0.010 -0.015 -0.015 -0.017 0.0038 | -0.018 -0.025 -0.025 -0.025 -0.000 | 0.0860 0.0895 0.0749 0.0990 0.0833 | -0.022 -0.027 -0.032 -0.027 -0.003 | -0.003 -0.007 -0.011 -0.010 0.0119 | -0.008 -0.014 -0.017 -0.014 0.0082 |
ipyrad and stacks both kindly write out phylip files, but ddocent doesn't, so we'll have to make one ourselves.
%%bash -s "$WORK_DIR"
## Install raxml
mkdir $1/miniconda/src
cd $1/miniconda/src
git clone https://github.com/stamatak/standard-RAxML.git
cd standard-RAxML
make -f Makefile.PTHREADS.gcc
cp raxml-PTHREADS $1/miniconda/bin
rm -f *.o raxmlHPC-PTHREADS-AVX2 gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o axml.o axml.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o optimizeModel.o optimizeModel.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o multiple.o multiple.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o searchAlgo.o searchAlgo.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o topologies.o topologies.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o parsePartitions.o parsePartitions.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o treeIO.o treeIO.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o models.o models.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o bipartitionList.o bipartitionList.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o rapidBootstrap.o rapidBootstrap.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o evaluatePartialGenericSpecial.o evaluatePartialGenericSpecial.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o evaluateGenericSpecial.o evaluateGenericSpecial.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o newviewGenericSpecial.o newviewGenericSpecial.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o makenewzGenericSpecial.o makenewzGenericSpecial.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o classify.o classify.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -mavx -c -o fastDNAparsimony.o fastDNAparsimony.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o fastSearch.o fastSearch.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o leaveDropping.o leaveDropping.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o rmqs.o rmqs.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o rogueEPA.o rogueEPA.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o ancestralStates.o ancestralStates.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -mavx2 -D_FMA -march=core-avx2 -c -o avxLikelihood.o avxLikelihood.c gcc -D_USE_PTHREADS -D__SIM_SSE3 -O2 -D_GNU_SOURCE -msse3 -fomit-frame-pointer -funroll-loops -D__AVX -c -o mem_alloc.o mem_alloc.c gcc -c -o eigen.o eigen.c gcc -o raxmlHPC-PTHREADS-AVX2 axml.o optimizeModel.o multiple.o searchAlgo.o topologies.o parsePartitions.o treeIO.o models.o bipartitionList.o rapidBootstrap.o evaluatePartialGenericSpecial.o evaluateGenericSpecial.o newviewGenericSpecial.o makenewzGenericSpecial.o classify.o fastDNAparsimony.o fastSearch.o leaveDropping.o rmqs.o rogueEPA.o ancestralStates.o avxLikelihood.o mem_alloc.o eigen.o -lm -pthread
mkdir: cannot create directory ‘/home/iovercast/manuscript-analysis//miniconda/src’: File exists fatal: destination path 'standard-RAxML' already exists and is not an empty directory.
## Get this script to convert from vcf to phy
## This script wants python 3, so I had to go in and add the future import for
## print by hand. Add this on line 17:
##
## from __future__ import print_function
##
## Also, had to update lines 32-34 like so:
##
## iupac = {"AG": "R", "CT": "Y", "CG": "S", "AT": "W", "GT": "K", "AC": "M",
## "CGT": "B", "AGT": "D", "ACT": "H", "ACG": "V", "ACGT": "N", "AA": "A",
## "CC": "C", "GG": "G", "TT":"T", "GN":"N", "CN":"N", "AN":"N", "TN":"N"}
## "NT":"N", "NC":"N", "NG":"N", "NA":"N", "NN":"N"}
## Also, the when we decompose the ddocent complex genotypes vcftools splits
## ./. data as '.', which VCF2phy.py kinda hates, so you have to add this piece
## of code on lines 75/75:
##
## if genotype_string == ".":
## return "N"
#!wget https://raw.githubusercontent.com/CoBiG2/RAD_Tools/master/VCF2phy.py
for k, v in vcf_dict.items():
if "stacks" in k and "sim" in k:
outphy = v.rsplit(".", 1)[0]
print(outphy)
cmd = "python VCF2phy.py -vcf {} -o {}".format(v, outphy)
ret = subprocess.check_output(cmd, shell=True)
print(ret)
if "ddocent" in k:
## Skip doing the big boy for now
if "empirical" in k and "Total" in v:
continue
## Do not add .phy to outphy, the python script does it for us.
outphy = v.rsplit(".", 1)[0]
print(outphy)
cmd = "python VCF2phy.py -vcf {} -o {}".format(v, outphy)
ret = subprocess.check_output(cmd, shell=True)
print(ret)
/home/iovercast/manuscript-analysis/REFMAP_SIM/stacks/batch_1 Loci file not provided. Converting only variable sites in VCF. /home/iovercast/manuscript-analysis/Phocoena_empirical/ddocent/Final.recode.snps.vcf.recode.reordered.phy Loci file not provided. Converting only variable sites in VCF. /home/iovercast/manuscript-analysis/REFMAP_SIM/ddocent/TotalRawSNPs.snps.vcf.recode.phy Loci file not provided. Converting only variable sites in VCF.
For simulated datasets raxml runs very fast (<20 seconds per phylip file).
For empirical it's much slower, maybe 1-2 days per assembler. Be sure to use the '.snps.phy' ipyrad file or else raxml runs forever.
for d in [REFMAP_SIM_DIR, REFMAP_EMPIRICAL_DIR]:
raxoutdir = d + "raxml_outdir"
if not os.path.exists(raxoutdir):
os.mkdir(raxoutdir)
emp_trees = {}
sim_trees = {}
for assembler, vcffile in vcf_dict.items():
if "sim" in assembler:
outdir = REFMAP_SIM_DIR + "raxml_outdir"
ncores = 2
out_trees = sim_trees
physplit = 1
else:
outdir = REFMAP_EMPIRICAL_DIR + "raxml_outdir"
ncores = 20
out_trees = emp_trees
physplit = 2
if "ipyrad" in assembler:
inputfile = vcffile.rsplit(".", physplit)[0] + ".snps.phy"
if "stacks" in assembler:
inputfile = vcffile.rsplit(".", 2)[0] + ".phylip"
if "ddocent" in assembler:
inputfile = vcffile.rsplit(".", 1)[0] + ".phy"
if not os.path.exists(inputfile):
print("\nCan't find .phy or .phylip for - {}\n".format(inputfile.rsplit(".", 1)[0]))
cmd = "{}raxmlHPC-PTHREADS -f a ".format(WORK_DIR + "/miniconda/bin/") \
+ " -T {} ".format(ncores) \
+ " -m GTRGAMMA " \
+ " -N 100 " \
+ " -x 12345 " \
+ " -p 54321 " \
+ " -n {} ".format(assembler) \
+ " -w {} ".format(outdir) \
+ " -s {}".format(inputfile)
print(cmd)
## What's the difference?
## out_trees[assembler] = "{}/RAxML_bestTree.{}".format(outdir, assembler)
out_trees[assembler] = "{}/RAxML_bipartitions.{}".format(outdir, assembler)
#continue
if "sim" in assembler:
print(assembler)
#!time $cmd
if "empirical" in assembler:
print(assembler)
#!time $cmd
Can't find .phy or .phylip for - /home/iovercast/manuscript-analysis/REFMAP_SIM/ddocent/Final.recode.snps.vcf.recode /home/iovercast/manuscript-analysis//miniconda/bin/raxmlHPC-PTHREADS -f a -T 2 -m GTRGAMMA -N 100 -x 12345 -p 54321 -n ddocent-fin-sim -w /home/iovercast/manuscript-analysis/REFMAP_SIM/raxml_outdir -s /home/iovercast/manuscript-analysis/REFMAP_SIM/ddocent/Final.recode.snps.vcf.recode.phy ddocent-fin-sim Can't find .phy or .phylip for - /home/iovercast/manuscript-analysis/Phocoena_empirical/stacks/batch_1 /home/iovercast/manuscript-analysis//miniconda/bin/raxmlHPC-PTHREADS -f a -T 20 -m GTRGAMMA -N 100 -x 12345 -p 54321 -n stacks-empirical -w /home/iovercast/manuscript-analysis/Phocoena_empirical/raxml_outdir -s /home/iovercast/manuscript-analysis/Phocoena_empirical/stacks/batch_1.phylip stacks-empirical /home/iovercast/manuscript-analysis//miniconda/bin/raxmlHPC-PTHREADS -f a -T 2 -m GTRGAMMA -N 100 -x 12345 -p 54321 -n ipyrad-denovo_minus_reference-sim -w /home/iovercast/manuscript-analysis/REFMAP_SIM/raxml_outdir -s /home/iovercast/manuscript-analysis/REFMAP_SIM/ipyrad/reference-assembly/denovo_minus_reference-sim_outfiles/denovo_minus_reference-sim.snps.phy ipyrad-denovo_minus_reference-sim /home/iovercast/manuscript-analysis//miniconda/bin/raxmlHPC-PTHREADS -f a -T 20 -m GTRGAMMA -N 100 -x 12345 -p 54321 -n ipyrad-denovo_plus_reference-095-empirical -w /home/iovercast/manuscript-analysis/Phocoena_empirical/raxml_outdir -s /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/denovo_plus_reference-095_outfiles/denovo_plus_reference-095.snps.phy ipyrad-denovo_plus_reference-095-empirical Can't find .phy or .phylip for - /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/denovo_plus_reference_outfiles/denovo_ref-empirical.snps /home/iovercast/manuscript-analysis//miniconda/bin/raxmlHPC-PTHREADS -f a -T 20 -m GTRGAMMA -N 100 -x 12345 -p 54321 -n ipyrad-denovo_plus_reference-empirical -w /home/iovercast/manuscript-analysis/Phocoena_empirical/raxml_outdir -s /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/denovo_plus_reference_outfiles/denovo_ref-empirical.snps.phy ipyrad-denovo_plus_reference-empirical /home/iovercast/manuscript-analysis//miniconda/bin/raxmlHPC-PTHREADS -f a -T 2 -m GTRGAMMA -N 100 -x 12345 -p 54321 -n stacks-sim -w /home/iovercast/manuscript-analysis/REFMAP_SIM/raxml_outdir -s /home/iovercast/manuscript-analysis/REFMAP_SIM/stacks/batch_1.phylip stacks-sim /home/iovercast/manuscript-analysis//miniconda/bin/raxmlHPC-PTHREADS -f a -T 20 -m GTRGAMMA -N 100 -x 12345 -p 54321 -n ipyrad-reference-empirical -w /home/iovercast/manuscript-analysis/Phocoena_empirical/raxml_outdir -s /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/refmap-empirical_outfiles/refmap-empirical.snps.phy ipyrad-reference-empirical /home/iovercast/manuscript-analysis//miniconda/bin/raxmlHPC-PTHREADS -f a -T 20 -m GTRGAMMA -N 100 -x 12345 -p 54321 -n ipyrad-denovo_minus_reference-empirical -w /home/iovercast/manuscript-analysis/Phocoena_empirical/raxml_outdir -s /home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/denovo_minus_reference_outfiles/denovo_ref-empirical.snps.phy ipyrad-denovo_minus_reference-empirical /home/iovercast/manuscript-analysis//miniconda/bin/raxmlHPC-PTHREADS -f a -T 20 -m GTRGAMMA -N 100 -x 12345 -p 54321 -n ddocent-fin-empirical -w /home/iovercast/manuscript-analysis/Phocoena_empirical/raxml_outdir -s /home/iovercast/manuscript-analysis/Phocoena_empirical/ddocent/Final.recode.snps.vcf.recode.reordered.phy ddocent-fin-empirical /home/iovercast/manuscript-analysis//miniconda/bin/raxmlHPC-PTHREADS -f a -T 2 -m GTRGAMMA -N 100 -x 12345 -p 54321 -n ddocent-tot-sim -w /home/iovercast/manuscript-analysis/REFMAP_SIM/raxml_outdir -s /home/iovercast/manuscript-analysis/REFMAP_SIM/ddocent/TotalRawSNPs.snps.vcf.recode.phy ddocent-tot-sim /home/iovercast/manuscript-analysis//miniconda/bin/raxmlHPC-PTHREADS -f a -T 2 -m GTRGAMMA -N 100 -x 12345 -p 54321 -n ipyrad-reference-sim -w /home/iovercast/manuscript-analysis/REFMAP_SIM/raxml_outdir -s /home/iovercast/manuscript-analysis/REFMAP_SIM/ipyrad/reference-assembly/refmap-sim_outfiles/refmap-sim.snps.phy ipyrad-reference-sim /home/iovercast/manuscript-analysis//miniconda/bin/raxmlHPC-PTHREADS -f a -T 2 -m GTRGAMMA -N 100 -x 12345 -p 54321 -n ipyrad-denovo_plus_reference-sim -w /home/iovercast/manuscript-analysis/REFMAP_SIM/raxml_outdir -s /home/iovercast/manuscript-analysis/REFMAP_SIM/ipyrad/reference-assembly/denovo_plus_reference-sim_outfiles/denovo_plus_reference-sim.snps.phy ipyrad-denovo_plus_reference-sim
#!conda install biopython
from Bio import Phylo
f, axarr = plt.subplots(1, 5, figsize=(20,8), dpi=1000)
## For more than 1 row uncomment this
#axarr = [a for b in axarr for a in b]
## Pop membership and colors per pop were estabilshed above during the PCA
#sim_sample_names = pop1 + pop2 + pop3
#sim_pops = {"pop1":pop1, "pop2":pop2, "pop3":pop3}
#sim_pop_colors = {"pop1":"r", "pop2":"b", "pop3":"g"}
sim_colors_per_sample = {}
for samp in sim_sample_names:
for pop in sim_pops:
if samp in sim_pops[pop]:
sim_colors_per_sample[samp] = sim_pop_colors[pop]
## Set them in order so the plot looks nice.
#progs = ["ipyrad-reference-sim", "ipyrad-denovo_plus_reference-sim", "ipyrad-denovo_minus_reference-sim", "stacks-sim", "ddocent-fin-sim",\
# "ipyrad-reference-empirical", "ipyrad-denovo_reference-empirical", "ipyrad-denovo_minus_reference-empirical", "stacks-empirical", "ddocent-fin-empirical"]
outgroup = [{'name': taxon_name} for taxon_name in pop3]
for assembler, ax in zip(sim_trees.keys(), axarr):
print(assembler, sim_trees[assembler], ax)
tree = Phylo.read(sim_trees[assembler], 'newick')
tree.ladderize()
tree.root_with_outgroup(*outgroup)
## This could be cool but doesn't work
Phylo.draw(tree, axes = ax, do_show=False, label_colors=sim_colors_per_sample)
ax.set_title(assembler)
plt.tight_layout()
('ddocent-fin-sim', '/home/iovercast/manuscript-analysis/REFMAP_SIM/raxml_outdir/RAxML_bipartitions.ddocent-fin-sim', <matplotlib.axes._subplots.AxesSubplot object at 0x2aaba1b1d890>) ('ipyrad-denovo_minus_reference-sim', '/home/iovercast/manuscript-analysis/REFMAP_SIM/raxml_outdir/RAxML_bipartitions.ipyrad-denovo_minus_reference-sim', <matplotlib.axes._subplots.AxesSubplot object at 0x2aaba9fa5110>) ('stacks-sim', '/home/iovercast/manuscript-analysis/REFMAP_SIM/raxml_outdir/RAxML_bipartitions.stacks-sim', <matplotlib.axes._subplots.AxesSubplot object at 0x2aaba9ec22d0>) ('ipyrad-reference-sim', '/home/iovercast/manuscript-analysis/REFMAP_SIM/raxml_outdir/RAxML_bipartitions.ipyrad-reference-sim', <matplotlib.axes._subplots.AxesSubplot object at 0x2aabaa229910>) ('ipyrad-denovo_plus_reference-sim', '/home/iovercast/manuscript-analysis/REFMAP_SIM/raxml_outdir/RAxML_bipartitions.ipyrad-denovo_plus_reference-sim', <matplotlib.axes._subplots.AxesSubplot object at 0x2aabaa14be10>)
del sim_trees['ddocent-tot-sim']
print(len(sim_trees))
print(sim_trees)
5 {'ddocent-fin-sim': '/home/iovercast/manuscript-analysis/REFMAP_SIM/raxml_outdir/RAxML_bipartitions.ddocent-fin-sim', 'ipyrad-denovo_minus_reference-sim': '/home/iovercast/manuscript-analysis/REFMAP_SIM/raxml_outdir/RAxML_bipartitions.ipyrad-denovo_minus_reference-sim', 'stacks-sim': '/home/iovercast/manuscript-analysis/REFMAP_SIM/raxml_outdir/RAxML_bipartitions.stacks-sim', 'ipyrad-reference-sim': '/home/iovercast/manuscript-analysis/REFMAP_SIM/raxml_outdir/RAxML_bipartitions.ipyrad-reference-sim', 'ipyrad-denovo_plus_reference-sim': '/home/iovercast/manuscript-analysis/REFMAP_SIM/raxml_outdir/RAxML_bipartitions.ipyrad-denovo_plus_reference-sim'}
## Set empirical colors
emp_colors_per_sample = {}
for samp in emp_sample_names:
for pop in emp_pops:
if samp in emp_pops[pop]:
emp_colors_per_sample[samp] = emp_pop_colors[pop]
import ete3
for prog in all_calldata.keys():
#for prog in ["ipyrad-reference-sim"]:
print("Doing {}".format(prog))
try:
c = all_calldata[prog]
v = all_vardata[prog]
except:
print("Failed to load for {}".format(prog))
continue
## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["POS"]), mask=only_pis).compressed()
## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x) for x in pis])
#distpis = [x for x in sorted(counts.items())]
## Getting the distvar is easier
distvar = Counter([int(x) for x in v.POS])
#distvar = [x for x in sorted(counts.items())]
canvas, axes = SNP_position_plot(prog, distvar, distpis)
## save fig
#toyplot.html.render(canvas, 'snp_positions.html')
canvas
Doing ipyrad-reference-sim Doing ipyrad-denovo_reference-sim Doing stacks-sim Doing ddocent-simipyrad-reference-empirical Failed to load for ddocent-simipyrad-reference-empirical Doing ipyrad-denovo_reference-empirical Failed to load for ipyrad-denovo_reference-empirical Doing stacks-empirical Doing ddocent-fin-empirical
## Checking out differences between 0.85 and 0.95 clustering threshold for ipyrad
filename = "/home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/denovo_minus_reference-0.95_outfiles/denovo_ref-empirical.vcf"
## Only need to load once
#myv = vcfnp.variants(filename, verbose=False, dtypes={"CHROM":"a24"}).view(np.recarray)
#myc = vcfnp.calldata_2d(filename, verbose=False).view(np.recarray)
f, axarr = plt.subplots(1, 2, figsize=(20,8), dpi=1000)
## Make a list out of the axes
## Set them in order so the plot looks nice.
progs = ["ipyrad-reference-sim", "ipyrad-denovo_plus_reference-sim", "ipyrad-denovo_minus_reference-sim", "stacks-sim", "ddocent-fin-sim",\
"ipyrad-reference-empirical", "ipyrad-denovo_reference-empirical", "ipyrad-denovo_minus_reference-empirical", "stacks-empirical", "ddocent-fin-empirical"]
rundict = {"0.85":all_calldata["ipyrad-denovo_minus_reference-empirical"], "0.95":myc}
for prog, ax in zip(rundict.keys(), axarr):
pop_colors = emp_pop_colors
pops = emp_pops
sample_names = emp_sample_names
## Don't die if some of the runs aren't complete
try:
coords1, model1 = getPCA(rundict[prog])
except:
continue
x = coords1[:, 0]
y = coords1[:, 1]
ax.scatter(x, y, marker='o')
ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))
for pop in pops.keys():
flt = np.in1d(np.array(sample_names), pops[pop])
ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=pop_colors[pop], label=pop, markersize=10, mec='k', mew=.5)
ax.set_title(prog, style="italic")
ax.axison = True
axarr[0].legend(frameon=True)
<matplotlib.legend.Legend at 0x2aaba8c59b90>
def plot_ld(gn, title):
m = allel.stats.rogers_huff_r(gn) ** 2
ax = allel.plot.pairwise_ld(m)
ax.set_title(title)
def ld_prune(gn, size=1000, step=1000, threshold=.3, n_iter=5):
for i in range(n_iter):
loc_unlinked = allel.stats.ld.locate_unlinked(gn, size=size, step=step, threshold=threshold)
n = np.count_nonzero(loc_unlinked)
n_remove = gn.shape[0] - n
print('iteration', i+1, 'retaining', n, 'removing', n_remove, 'variants')
gn = gn.compress(loc_unlinked, axis=0)
return gn
g = allel.GenotypeArray(myc.genotype)
ac = g.count_alleles()
## Filter singletons and multi-allelic snps
flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
gf = g.compress(flt, axis=0)
gn = gf.to_n_alt()
gnu = ld_prune(gn, size=50, step=200, threshold=.1, n_iter=5)
('iteration', 1, 'retaining', 70451, 'removing', 12654, 'variants') ('iteration', 2, 'retaining', 60084, 'removing', 10367, 'variants') ('iteration', 3, 'retaining', 51456, 'removing', 8628, 'variants') ('iteration', 4, 'retaining', 44316, 'removing', 7140, 'variants') ('iteration', 5, 'retaining', 38444, 'removing', 5872, 'variants')
coords1, model1 = allel.pca(gnu, n_components=10, scaler='patterson')