## Prereqs for plotting and analysis
#!conda install matplotlib
#!conda install libgcc # for vcfnp
#!pip install vcfnp
#!pip install scikit-allel
#!conda install -y seaborn
#!pip install toyplot
#!pip install numpy_indexed
Collecting numpy-indexed
Collecting future (from numpy-indexed)
Downloading future-0.16.0.tar.gz (824kB)
100% |████████████████████████████████| 829kB 426kB/s
Requirement already satisfied (use --upgrade to upgrade): pyyaml in /home/iovercast/opt/miniconda/lib/python2.7/site-packages (from numpy-indexed)
Building wheels for collected packages: future
Running setup.py bdist_wheel for future ... - \ | / - \ | done
Stored in directory: /home/iovercast/.cache/pip/wheels/c2/50/7c/0d83b4baac4f63ff7a765bd16390d2ab43c93587fac9d6017a
Successfully built future
Installing collected packages: future, numpy-indexed
Successfully installed future-0.16.0 numpy-indexed-0.3.4
You are using pip version 8.1.1, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
## Imports and working/output directories directories
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = [12,9]
from collections import Counter
import seaborn as sns
sns.set_style('white')
sns.set_style('ticks')
import toyplot
import toyplot.html ## toypot sublib for saving html plots
import pandas as pd
import numpy as np
import collections
import allel
import vcfnp
import shutil
import glob
import os
from allel.util import * ## for ensure_square()
## Set the default directories for exec and data.
WORK_DIR="/home/iovercast/manuscript-analysis/"
EMPERICAL_DATA_DIR=os.path.join(WORK_DIR, "example_empirical_rad/")
IPYRAD_DIR=os.path.join(WORK_DIR, "ipyrad/")
PYRAD_DIR=os.path.join(WORK_DIR, "pyrad/")
STACKS_DIR=os.path.join(WORK_DIR, "stacks/")
AFTRRAD_DIR=os.path.join(WORK_DIR, "aftrRAD/")
DDOCENT_DIR=os.path.join(WORK_DIR, "dDocent/")
## tmp for testing
#IPYRAD_OUTPUT = "./arch/ipyrad/REALDATA/"
#PYRAD_OUTPUT = "./arch/pyrad/REALDATA/"
os.chdir(WORK_DIR)
Next will do some housekeeping to get our empirical samples sorted into species.
## Get sample names and assign them to a species dict
IPYRAD_STATS=os.path.join(IPYRAD_DIR, "REALDATA/REALDATA_outfiles/REALDATA_stats.txt")
infile = open(IPYRAD_STATS).readlines()
sample_names = [x.strip().split()[0] for x in infile[20:33]]
species = set([x.split("_")[1] for x in sample_names])
species_dict = collections.OrderedDict([])
## Ordered dict of sample names and their species, in same order
## as the vcf file
for s in sample_names:
species_dict[s] = s.split("_")[1]
print(species_dict)
## Map species names to groups of individuals
#for s in species:
# species_dict[s] = [x for x in sample_names if s in x]
#print(species_dict)
species_colors = {
'rex': '#FF0000',
'cyathophylla': '#008000',
'thamno': '#00FFFF',
'cyathophylloides': '#90EE90',
'przewalskii': '#FFA500',
'superba': '#8B0000',
}
OrderedDict([('29154_superba', 'superba'), ('30556_thamno', 'thamno'), ('30686_cyathophylla', 'cyathophylla'), ('32082_przewalskii', 'przewalskii'), ('33413_thamno', 'thamno'), ('33588_przewalskii', 'przewalskii'), ('35236_rex', 'rex'), ('35855_rex', 'rex'), ('38362_rex', 'rex'), ('39618_rex', 'rex'), ('40578_rex', 'rex'), ('41478_cyathophylloides', 'cyathophylloides'), ('41954_cyathophylloides', 'cyathophylloides')])
def plotPCA(call_data, title):
c = call_data
g = allel.GenotypeArray(c.genotype)
ac = g.count_alleles()
## Filter singletons and multi-allelic snps
flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
gf = g.compress(flt, axis=0)
gn = gf.to_n_alt()
coords1, model1 = allel.stats.pca(gn, n_components=10, scaler='patterson')
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(1, 1, 1)
sns.despine(ax=ax, offset=5)
x = coords1[:, 0]
y = coords1[:, 1]
## We know this works because the species_dict and the columns in the vcf
## are in the same order.
for sp in species:
flt = (np.array(species_dict.values()) == sp)
ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=species_colors[sp], label=sp, markersize=10, mec='k', mew=.5)
ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))
ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
fig.suptitle(title+" pca", y=1.02, style="italic", fontsize=20, fontweight='bold')
fig.tight_layout()
def getPCA(call_data):
c = call_data
g = allel.GenotypeArray(c.genotype)
ac = g.count_alleles()
## Filter singletons and multi-allelic snps
flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
gf = g.compress(flt, axis=0)
gn = gf.to_n_alt()
coords1, model1 = allel.stats.pca(gn, n_components=10, scaler='patterson')
return coords1, model1
"""
fig = plt.figure(figsize=(5, 5))
x = coords1[:, 0]
y = coords1[:, 1]
## We know this works because the species_dict and the columns in the vcf
## are in the same order.
# for sp in species:
# flt = (np.array(species_dict.values()) == sp)
# ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=species_colors[sp], label=sp, markersize=10, mec='k', mew=.5)
ax.plot(x, y, marker='o', markersize=10)
ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))
"""
def plotPairwiseDistance(call_data, title):
c = call_data
#c = vcfnp.calldata_2d(filename).view(np.recarray)
g = allel.GenotypeArray(c.genotype)
gn = g.to_n_alt()
dist = allel.stats.pairwise_distance(gn, metric='euclidean')
allel.plot.pairwise_distance(dist, labels=species_dict.keys())
def getDistances(call_data):
c = call_data
#c = vcfnp.calldata_2d(filename).view(np.recarray)
g = allel.GenotypeArray(c.genotype)
gn = g.to_n_alt()
dist = allel.stats.pairwise_distance(gn, metric='euclidean')
return(dist)
## Inputs to this function are two Counters where keys
## are the base position and values are snp counts.
## The counter doesn't have to be sorted because we sort internally.
def SNP_position_plot(distvar, distpis):
## The last position to consider
maxend = max(distvar.keys())
## This does two things, first it sorts in increasing
## order. Second, it creates a count bin for any position
## without snps and sets the count to 0.
distvar = [distvar[x] for x in xrange(maxend)]
distpis = [distpis[x] for x in xrange(maxend)]
## set color theme
colormap = toyplot.color.Palette()
## make a canvas
canvas = toyplot.Canvas(width=800, height=300)
## make axes
axes = canvas.cartesian(xlabel="Position along RAD loci",
ylabel="N variables sites",
gutter=65)
## x-axis
axes.x.ticks.show = True
axes.x.label.style = {"baseline-shift":"-40px", "font-size":"16px"}
axes.x.ticks.labels.style = {"baseline-shift":"-2.5px", "font-size":"12px"}
axes.x.ticks.below = 5
axes.x.ticks.above = 0
axes.x.domain.max = maxend
axes.x.ticks.locator = toyplot.locator.Explicit(
range(0, maxend, 5),
map(str, range(0, maxend, 5)))
## y-axis
axes.y.ticks.show=True
axes.y.label.style = {"baseline-shift":"40px", "font-size":"16px"}
axes.y.ticks.labels.style = {"baseline-shift":"5px", "font-size":"12px"}
axes.y.ticks.below = 0
axes.y.ticks.above = 5
## add fill plots
x = np.arange(0, maxend)
f1 = axes.fill(x, distvar, color=colormap[0], opacity=0.5, title="total variable sites")
f2 = axes.fill(x, distpis, color=colormap[1], opacity=0.5, title="parsimony informative sites")
## add a horizontal dashed line at the median Nsnps per site
axes.hlines(np.median(distvar), opacity=0.9, style={"stroke-dasharray":"4, 4"})
axes.hlines(np.median(distpis), opacity=0.9, style={"stroke-dasharray":"4, 4"})
return canvas, axes
import numpy_indexed as npi
## Get the number of samples with data at each snp
def snp_coverage(call_data):
snp_counts = collections.Counter([x.sum() for x in call_data["GT"] != "./."])
## Fill zero values
return [snp_counts[x] for x in xrange(1, max(snp_counts.keys())+1)]
## Get the number of samples with data at each locus
def loci_coverage(var_data, call_data, assembler):
if "stacks" in assembler:
loci = zip(*npi.group_by(map(lambda x: x.split("_")[0],var_data["ID"]))(call_data["GT"] != "./."))
else:
loci = zip(*npi.group_by(var_data["CHROM"])(call_data["GT"] != "./."))
counts_per_snp = []
for z in xrange(0, len(loci)):
counts_per_snp.append([x.sum() for x in loci[z][1]])
counts = collections.Counter([np.max(x) for x in counts_per_snp])
## Fill all zero values
return [counts[x] for x in xrange(1, max(counts.keys())+1)]
## Get total number of snps per sample
def sample_nsnps(call_data):
return [x.sum() for x in call_data["GT"].T != "./."]
## Get total number of loci per sample
def sample_nloci(var_data, call_data, assembler):
if "stacks" in assembler:
locus_groups = npi.group_by(map(lambda x: x.split("_")[0],var_data["ID"]))(call_data["GT"] != "./.")
else:
locus_groups = npi.group_by(v["CHROM"])(c["GT"] != "./.")
by_locus = [x.T for x in locus_groups[1]]
by_sample = np.array([(x).any(axis=1) for x in by_locus])
return [x.sum() for x in by_sample.T]
## Make a new pandas dataframe for holding the coverage results
## this is going to
sim_vcf_dict = {}
for sim in ["simno", "simlo", "simhi", "simlarge"]:
sim_vcf_dict["ipyrad-"+sim] = os.path.join(IPYRAD_DIR, "SIMDATA/{}/{}_outfiles/{}.vcf".format(sim, sim, sim))
sim_vcf_dict["pyrad-"+sim] = os.path.join(PYRAD_DIR, "SIMDATA/{}/outfiles/c85d6m2p3H3N3.vcf".format(sim))
sim_vcf_dict["stacks_ungapped-"+sim] = os.path.join(STACKS_DIR, "SIMDATA/ungapped/{}/batch_1.vcf".format(sim))
sim_vcf_dict["stacks_gapped-"+sim] = os.path.join(STACKS_DIR, "SIMDATA/gapped/{}/batch_1.vcf".format(sim))
sim_vcf_dict["stacks_defualt-"+sim] = os.path.join(STACKS_DIR, "SIMDATA/default/{}/batch_1.vcf".format(sim))
sim_vcf_dict["aftrrad-"+sim] = os.path.join(AFTRRAD_DIR, "SIMDATA/{}/Formatting/{}.vcf".format(sim, sim))
sim_vcf_dict["ddocent_full-"+sim] = os.path.join(DDOCENT_DIR, "SIMDATA/{}/{}_fastqs/TotalRawSNPs.vcf".format(sim, sim))
sim_vcf_dict["ddocent_filt-"+sim] = os.path.join(DDOCENT_DIR, "SIMDATA/{}/{}_fastqs/Final.recode.vcf".format(sim, sim))
Each program writes out a vcf with some little quirks, so we have to smooth them all out. gunzip and filter out all non-biallelic loci. You only need to run this cell once, if you are reanalyzing the simdata.
for k, f in sim_vcf_dict.items():
## If it's gzipped then unzip it (only applies to ipyrad)
if os.path.exists(f + ".gz"):
print("gunzipping - {}".format(f+".gz"))
cmd = "gunzip -c {}.gz > {}".format(f, f)
!$cmd
f = f.split(".gz")[0]
sim_vcf_dict[k] = f
if os.path.exists(f):
print("found - {}".format(f))
if "stacks" in k:
## The version of vcftools we have here doesn't like the newer version of vcf stacks
## writes, so we have to 'fix' the stacks vcf version
shutil.copy2(f, f+".bak")
with open(f) as infile:
dat = infile.readlines()[1:]
with open(f, 'w') as outfile:
outfile.write("##fileformat=VCFv4.1\n")
outfile.write("".join(dat))
## Actually don't do this, it removes information about the assembly.
## I don't know why i wanted to do this in the first place. Habit?
## try:
## ## Remove all but biallelic.
## outfile = f.split(".vcf")[0]+"-biallelic"
## cmd = "{}vcftools --vcf {} --min-alleles 2 --max-alleles 2 --recode --out {}" \
## .format(DDOCENT_DIR, f, outfile)
## #print(cmd)
## os.system(cmd)
##
## ## update the vcf_dict
## sim_vcf_dict[k] = outfile + ".recode.vcf"
## except Exception as inst:
## print(inst)
else:
print("not found - {}".format(f))
found - /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simlarge/simlarge_outfiles/simlarge-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simhi/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simlo/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlarge/simlarge_fastqs/TotalRawSNPs-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlo/simlo_fastqs/Final.recode-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simno/batch_1-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simlarge/batch_1-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simno/Formatting/simno-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simlo/Formatting/simlo-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simno/simno_fastqs/Final.recode-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simlo/batch_1-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simlo/simlo_outfiles/simlo-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simhi/batch_1-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlarge/simlarge_fastqs/Final.recode-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simhi/batch_1-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simno/simno_outfiles/simno-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simlarge/Formatting/simlarge-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simhi/batch_1-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simhi/Formatting/simhi-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simno/batch_1-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simlarge/batch_1-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simlo/batch_1-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simhi/simhi_outfiles/simhi-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simno/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/TotalRawSNPs-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simno/batch_1-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simlarge/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlo/simlo_fastqs/TotalRawSNPs-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simlarge/batch_1-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simlo/batch_1-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/Final.recode-biallelic.recode.vcf found - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simno/simno_fastqs/TotalRawSNPs-biallelic.recode.vcf
Get number of loci per sample, and locus coverage for each vcf file from each assembler and each simulated dataset. This is reading in from the vcf files from each assembly method. This is nice because vcf is relatively standard and all the tools can give us a version of vcf. It's not perfect though because it doesn't include information about monomorphic sites, so it doesn't tell us the true number of loci recovered. We can get an idea of coverage and depth at snps, but to get coverage and depth stats across all loci we need to dig into the guts of the output of each method (which we'll do later).
import collections
sim_loc_cov = collections.OrderedDict()
sim_snp_cov = collections.OrderedDict()
sim_sample_nsnps = collections.OrderedDict()
sim_sample_nlocs = collections.OrderedDict()
## Try just doing them all the same
for prog, filename in sim_vcf_dict.items():
try:
print("Doing - {}".format(prog))
print("\t{}".format(filename))
v = vcfnp.variants(filename, verbose=False, dtypes={"CHROM":"a24"}).view(np.recarray)
c = vcfnp.calldata_2d(filename, verbose=False).view(np.recarray)
sim_snp_cov[prog] = snp_coverage(c)
sim_sample_nsnps[prog] = sample_nsnps(c)
## aftrRAD doesn't retain the kind of info we'd need for the next two
if "aftrrad" in prog:
continue
sim_loc_cov[prog] = loci_coverage(v, c, prog)
sim_sample_nlocs[prog] = sample_nloci(v, c, prog)
except Exception as inst:
print(inst)
[vcfnp] 2016-11-27 16:53:46.063946 :: caching is disabled [vcfnp] 2016-11-27 16:53:46.064616 :: building array
Doing - ipyrad-simlarge /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simlarge/simlarge_outfiles/simlarge-biallelic.recode.vcf
[vcfnp] 2016-11-27 16:53:50.046048 :: caching is disabled [vcfnp] 2016-11-27 16:53:50.046543 :: building array [vcfnp] 2016-11-27 16:54:20.066691 :: caching is disabled [vcfnp] 2016-11-27 16:54:20.067386 :: building array
Doing - pyrad-simhi /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simhi/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf
[vcfnp] 2016-11-27 16:54:20.493348 :: caching is disabled [vcfnp] 2016-11-27 16:54:20.493821 :: building array [vcfnp] 2016-11-27 16:54:23.319907 :: caching is disabled [vcfnp] 2016-11-27 16:54:23.320572 :: building array
Doing - pyrad-simlo /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simlo/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf
[vcfnp] 2016-11-27 16:54:23.765458 :: caching is disabled [vcfnp] 2016-11-27 16:54:23.765922 :: building array [vcfnp] 2016-11-27 16:54:26.717321 :: caching is disabled [vcfnp] 2016-11-27 16:54:26.717919 :: building array
Doing - ddocent_full-simlarge /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlarge/simlarge_fastqs/TotalRawSNPs-biallelic.recode.vcf
[vcfnp] 2016-11-27 16:54:38.023720 :: caching is disabled [vcfnp] 2016-11-27 16:54:38.024183 :: building array [vcfnp] 2016-11-27 16:55:53.857176 :: caching is disabled [vcfnp] 2016-11-27 16:55:53.857903 :: building array
Doing - ddocent_filt-simlo /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlo/simlo_fastqs/Final.recode-biallelic.recode.vcf
[vcfnp] 2016-11-27 16:55:55.048663 :: caching is disabled [vcfnp] 2016-11-27 16:55:55.049266 :: building array [vcfnp] 2016-11-27 16:56:02.929772 :: caching is disabled [vcfnp] 2016-11-27 16:56:02.930457 :: building array
Doing - stacks_gapped-simno /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simno/batch_1-biallelic.recode.vcf
[vcfnp] 2016-11-27 16:56:03.477877 :: caching is disabled [vcfnp] 2016-11-27 16:56:03.478366 :: building array [vcfnp] 2016-11-27 16:56:08.162218 :: caching is disabled [vcfnp] 2016-11-27 16:56:08.162971 :: building array
Doing - stacks_ungapped-simlarge /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simlarge/batch_1-biallelic.recode.vcf
[vcfnp] 2016-11-27 16:56:13.256345 :: caching is disabled [vcfnp] 2016-11-27 16:56:13.256817 :: building array [vcfnp] 2016-11-27 16:57:00.367378 :: caching is disabled [vcfnp] 2016-11-27 16:57:00.368017 :: building array
Doing - aftrrad-simno /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simno/Formatting/simno-biallelic.recode.vcf
[vcfnp] 2016-11-27 16:57:00.673033 :: caching is disabled [vcfnp] 2016-11-27 16:57:00.673508 :: building array [vcfnp] 2016-11-27 16:57:02.487800 :: caching is disabled [vcfnp] 2016-11-27 16:57:02.488378 :: building array
Doing - aftrrad-simlo /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simlo/Formatting/simlo-biallelic.recode.vcf
[vcfnp] 2016-11-27 16:57:02.816922 :: caching is disabled [vcfnp] 2016-11-27 16:57:02.817355 :: building array [vcfnp] 2016-11-27 16:57:04.730942 :: caching is disabled [vcfnp] 2016-11-27 16:57:04.731550 :: building array
Doing - ddocent_filt-simno /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simno/simno_fastqs/Final.recode-biallelic.recode.vcf
[vcfnp] 2016-11-27 16:57:06.023200 :: caching is disabled [vcfnp] 2016-11-27 16:57:06.023701 :: building array [vcfnp] 2016-11-27 16:57:14.012106 :: caching is disabled [vcfnp] 2016-11-27 16:57:14.012827 :: building array [vcfnp] 2016-11-27 16:57:14.103613 :: caching is disabled [vcfnp] 2016-11-27 16:57:14.104090 :: building array
Doing - stacks_gapped-simlo /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simlo/batch_1-biallelic.recode.vcf
[vcfnp] 2016-11-27 16:57:14.906996 :: caching is disabled [vcfnp] 2016-11-27 16:57:14.907457 :: building array
Doing - ipyrad-simlo /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simlo/simlo_outfiles/simlo-biallelic.recode.vcf
[vcfnp] 2016-11-27 16:57:15.318753 :: caching is disabled [vcfnp] 2016-11-27 16:57:15.319228 :: building array [vcfnp] 2016-11-27 16:57:18.347953 :: caching is disabled [vcfnp] 2016-11-27 16:57:18.348643 :: building array
Doing - stacks_defualt-simhi /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simhi/batch_1-biallelic.recode.vcf
[vcfnp] 2016-11-27 16:57:18.757473 :: caching is disabled [vcfnp] 2016-11-27 16:57:18.757896 :: building array [vcfnp] 2016-11-27 16:57:22.446983 :: caching is disabled [vcfnp] 2016-11-27 16:57:22.447555 :: building array
Doing - ddocent_filt-simlarge /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlarge/simlarge_fastqs/Final.recode-biallelic.recode.vcf
[vcfnp] 2016-11-27 16:57:33.454647 :: caching is disabled [vcfnp] 2016-11-27 16:57:33.455088 :: building array [vcfnp] 2016-11-27 16:58:47.221461 :: caching is disabled [vcfnp] 2016-11-27 16:58:47.222295 :: building array [vcfnp] 2016-11-27 16:58:47.238296 :: caching is disabled [vcfnp] 2016-11-27 16:58:47.238777 :: building array [vcfnp] 2016-11-27 16:58:47.295944 :: caching is disabled [vcfnp] 2016-11-27 16:58:47.296469 :: building array
Doing - stacks_gapped-simhi /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simhi/batch_1-biallelic.recode.vcf Doing - ipyrad-simno /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simno/simno_outfiles/simno-biallelic.recode.vcf
[vcfnp] 2016-11-27 16:58:47.749539 :: caching is disabled [vcfnp] 2016-11-27 16:58:47.750019 :: building array [vcfnp] 2016-11-27 16:58:50.666550 :: caching is disabled [vcfnp] 2016-11-27 16:58:50.667183 :: building array
Doing - aftrrad-simlarge /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simlarge/Formatting/simlarge-biallelic.recode.vcf
[vcfnp] 2016-11-27 16:58:52.924963 :: caching is disabled [vcfnp] 2016-11-27 16:58:52.925433 :: building array [vcfnp] 2016-11-27 16:59:07.577632 :: caching is disabled [vcfnp] 2016-11-27 16:59:07.578327 :: building array
Doing - stacks_ungapped-simhi /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simhi/batch_1-biallelic.recode.vcf
[vcfnp] 2016-11-27 16:59:08.104825 :: caching is disabled [vcfnp] 2016-11-27 16:59:08.105282 :: building array [vcfnp] 2016-11-27 16:59:12.857512 :: caching is disabled [vcfnp] 2016-11-27 16:59:12.858353 :: building array
Doing - aftrrad-simhi /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simhi/Formatting/simhi-biallelic.recode.vcf
[vcfnp] 2016-11-27 16:59:13.149329 :: caching is disabled [vcfnp] 2016-11-27 16:59:13.149779 :: building array [vcfnp] 2016-11-27 16:59:14.874231 :: caching is disabled [vcfnp] 2016-11-27 16:59:14.875006 :: building array
Doing - stacks_ungapped-simno /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simno/batch_1-biallelic.recode.vcf
[vcfnp] 2016-11-27 16:59:15.416360 :: caching is disabled [vcfnp] 2016-11-27 16:59:15.416977 :: building array [vcfnp] 2016-11-27 16:59:20.232799 :: caching is disabled [vcfnp] 2016-11-27 16:59:20.233563 :: building array
Doing - stacks_defualt-simlarge /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simlarge/batch_1-biallelic.recode.vcf
[vcfnp] 2016-11-27 16:59:24.182469 :: caching is disabled [vcfnp] 2016-11-27 16:59:24.183060 :: building array [vcfnp] 2016-11-27 17:00:01.487897 :: caching is disabled [vcfnp] 2016-11-27 17:00:01.488714 :: building array
Doing - stacks_ungapped-simlo /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simlo/batch_1-biallelic.recode.vcf
[vcfnp] 2016-11-27 17:00:02.032021 :: caching is disabled [vcfnp] 2016-11-27 17:00:02.032495 :: building array [vcfnp] 2016-11-27 17:00:07.044474 :: caching is disabled [vcfnp] 2016-11-27 17:00:07.045104 :: building array
Doing - ipyrad-simhi /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simhi/simhi_outfiles/simhi-biallelic.recode.vcf
[vcfnp] 2016-11-27 17:00:07.802941 :: caching is disabled [vcfnp] 2016-11-27 17:00:07.803469 :: building array [vcfnp] 2016-11-27 17:00:10.822673 :: caching is disabled [vcfnp] 2016-11-27 17:00:10.823271 :: building array
Doing - pyrad-simno /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simno/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf
[vcfnp] 2016-11-27 17:00:11.291555 :: caching is disabled [vcfnp] 2016-11-27 17:00:11.292032 :: building array [vcfnp] 2016-11-27 17:00:14.220085 :: caching is disabled [vcfnp] 2016-11-27 17:00:14.220821 :: building array
Doing - ddocent_full-simhi /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/TotalRawSNPs-biallelic.recode.vcf
[vcfnp] 2016-11-27 17:00:15.503526 :: caching is disabled [vcfnp] 2016-11-27 17:00:15.503993 :: building array [vcfnp] 2016-11-27 17:00:23.207787 :: caching is disabled [vcfnp] 2016-11-27 17:00:23.208546 :: building array
Doing - stacks_defualt-simno /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simno/batch_1-biallelic.recode.vcf
[vcfnp] 2016-11-27 17:00:23.640669 :: caching is disabled [vcfnp] 2016-11-27 17:00:23.641165 :: building array [vcfnp] 2016-11-27 17:00:27.520616 :: caching is disabled [vcfnp] 2016-11-27 17:00:27.521816 :: building array
Doing - pyrad-simlarge /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simlarge/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf
[vcfnp] 2016-11-27 17:00:31.749149 :: caching is disabled [vcfnp] 2016-11-27 17:00:31.749765 :: building array [vcfnp] 2016-11-27 17:00:59.712859 :: caching is disabled [vcfnp] 2016-11-27 17:00:59.713690 :: building array
Doing - ddocent_full-simlo /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlo/simlo_fastqs/TotalRawSNPs-biallelic.recode.vcf
[vcfnp] 2016-11-27 17:01:00.898081 :: caching is disabled [vcfnp] 2016-11-27 17:01:00.898895 :: building array [vcfnp] 2016-11-27 17:01:08.917655 :: caching is disabled [vcfnp] 2016-11-27 17:01:08.918244 :: building array
Doing - stacks_gapped-simlarge /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simlarge/batch_1-biallelic.recode.vcf
[vcfnp] 2016-11-27 17:01:14.105084 :: caching is disabled [vcfnp] 2016-11-27 17:01:14.105781 :: building array [vcfnp] 2016-11-27 17:01:59.933549 :: caching is disabled [vcfnp] 2016-11-27 17:01:59.934387 :: building array
Doing - stacks_defualt-simlo /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simlo/batch_1-biallelic.recode.vcf
[vcfnp] 2016-11-27 17:02:00.344161 :: caching is disabled [vcfnp] 2016-11-27 17:02:00.344609 :: building array [vcfnp] 2016-11-27 17:02:04.000344 :: caching is disabled [vcfnp] 2016-11-27 17:02:04.000860 :: building array
Doing - ddocent_filt-simhi /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/Final.recode-biallelic.recode.vcf
[vcfnp] 2016-11-27 17:02:05.309855 :: caching is disabled [vcfnp] 2016-11-27 17:02:05.310451 :: building array [vcfnp] 2016-11-27 17:02:13.334594 :: caching is disabled [vcfnp] 2016-11-27 17:02:13.335213 :: building array
Doing - ddocent_full-simno /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simno/simno_fastqs/TotalRawSNPs-biallelic.recode.vcf
[vcfnp] 2016-11-27 17:02:14.594919 :: caching is disabled [vcfnp] 2016-11-27 17:02:14.595400 :: building array
In case we need to restart the notebook then we won't have to come back and reload all the data
res_dict = {"sim_loc_cov":sim_loc_cov, "sim_snp_cov":sim_snp_cov,\
"sim_sample_nsnps":sim_sample_nsnps, "sim_sample_nlocs":sim_sample_nlocs}
res = WORK_DIR + "RESULTS/"
if not os.path.exists(res):
os.mkdir(res)
for k,v in res_dict.items():
df = pd.DataFrame(v)
df.to_csv(res + k + ".csv")
If you ever need to do this.
for k,v in res_dict.items():
df = pd.read_csv(res + k + ".csv")
print(df)
Unnamed: 0 ipyrad-simlarge pyrad-simhi pyrad-simlo \ 0 0 0 0 0 1 1 12 0 0 2 2 35 0 0 3 3 436 0 0 4 4 137 0 0 5 5 70 2 0 6 6 256 0 0 7 7 1737 0 0 8 8 1688 0 0 9 9 2327 0 0 10 10 9920 1 0 11 11 81527 9875 9884 ddocent_full-simlarge ddocent_filt-simlo stacks_gapped-simno \ 0 1 0 0 1 2 0 0 2 38 0 0 3 433 0 0 4 139 0 0 5 70 0 0 6 258 0 0 7 1732 0 0 8 1689 0 0 9 2322 0 5 10 9808 0 329 11 81397 9853 9559 stacks_ungapped-simlarge ddocent_filt-simno stacks_gapped-simlo \ 0 5 0 0 1 14 0 0 2 42 0 0 3 440 0 0 4 140 0 0 5 82 0 0 6 296 0 0 7 1743 0 1 8 1729 0 0 9 2627 0 2 10 12400 0 57 11 78902 9852 1452 ... ipyrad-simhi pyrad-simno ddocent_full-simhi \ 0 ... 0 0 0 1 ... 0 0 0 2 ... 0 0 0 3 ... 1 0 2 4 ... 0 0 0 5 ... 0 0 0 6 ... 0 0 0 7 ... 1 0 0 8 ... 0 0 1 9 ... 0 0 0 10 ... 1 0 4 11 ... 9842 9894 9844 stacks_defualt-simno pyrad-simlarge ddocent_full-simlo \ 0 350 0 0 1 259 13 0 2 365 38 0 3 528 446 0 4 68 141 0 5 60 72 0 6 153 258 0 7 668 1748 0 8 315 1704 0 9 338 2342 0 10 1311 9952 0 11 6352 81709 9853 stacks_gapped-simlarge stacks_defualt-simlo ddocent_filt-simhi \ 0 5 356 0 1 14 278 0 2 42 386 0 3 440 532 0 4 140 74 0 5 82 64 0 6 296 178 0 7 1743 690 0 8 1729 332 0 9 2627 371 0 10 12400 1447 4 11 78902 6105 9844 ddocent_full-simno 0 0 1 0 2 0 3 0 4 0 5 0 6 0 7 0 8 0 9 0 10 0 11 9852 [12 rows x 29 columns] Unnamed: 0 ipyrad-simlarge pyrad-simhi pyrad-simlo \ 0 0 95399 9877 9884 1 1 95428 9877 9884 2 2 95405 9877 9884 3 3 95456 9877 9884 4 4 95450 9877 9884 5 5 95412 9876 9884 6 6 95437 9877 9884 7 7 95418 9877 9884 8 8 95283 9877 9884 9 9 95294 9877 9884 10 10 95288 9877 9884 11 11 95302 9877 9884 ddocent_full-simlarge ddocent_filt-simlo stacks_gapped-simno \ 0 95169 9853 9870 1 95200 9853 9862 2 95167 9853 9865 3 95218 9853 9863 4 95203 9853 9862 5 95164 9853 9864 6 95198 9853 9864 7 95177 9853 9867 8 95048 9853 9865 9 95056 9853 9867 10 95049 9853 9863 11 95052 9853 9865 stacks_ungapped-simlarge ddocent_filt-simno stacks_gapped-simlo \ 0 95366 9852 1512 1 95401 9852 1503 2 95391 9852 1507 3 95400 9852 1503 4 95394 9852 1507 5 95367 9852 1504 6 95411 9852 1508 7 95405 9852 1508 8 95232 9852 1507 9 95276 9852 1506 10 95268 9852 1508 11 95271 9852 1506 ... ipyrad-simhi pyrad-simno ddocent_full-simhi \ 0 ... 9844 9894 9848 1 ... 9844 9894 9848 2 ... 9844 9894 9848 3 ... 9844 9894 9850 4 ... 9844 9894 9850 5 ... 9843 9894 9850 6 ... 9844 9894 9848 7 ... 9844 9894 9848 8 ... 9844 9894 9849 9 ... 9844 9894 9850 10 ... 9844 9894 9850 11 ... 9844 9894 9850 stacks_defualt-simno pyrad-simlarge ddocent_full-simlo \ 0 9320 95646 9853 1 9303 95675 9853 2 9253 95649 9853 3 9096 95708 9853 4 9042 95695 9853 5 9021 95659 9853 6 8958 95686 9853 7 8934 95672 9853 8 8846 95543 9853 9 8823 95552 9853 10 8767 95544 9853 11 8687 95558 9853 stacks_gapped-simlarge stacks_defualt-simlo ddocent_filt-simhi \ 0 95366 9275 9848 1 95401 9254 9848 2 95391 9207 9848 3 95400 9055 9848 4 95394 9020 9848 5 95367 8993 9848 6 95411 8929 9846 7 95405 8888 9847 8 95232 8807 9847 9 95276 8779 9848 10 95268 8741 9848 11 95271 8645 9848 ddocent_full-simno 0 9852 1 9852 2 9852 3 9852 4 9852 5 9852 6 9852 7 9852 8 9852 9 9852 10 9852 11 9852 [12 rows x 29 columns] Unnamed: 0 ipyrad-simlarge pyrad-simhi pyrad-simlo \ 0 0 399293 42421 43106 1 1 399339 42421 43105 2 2 399224 42417 43105 3 3 399319 42423 43099 4 4 399355 42416 43100 5 5 399039 42421 43101 6 6 399259 42421 43104 7 7 399070 42421 43101 8 8 398377 42427 43103 9 9 398531 42423 43106 10 10 398376 42425 43106 11 11 398234 42423 43103 ddocent_full-simlarge ddocent_filt-simlo stacks_gapped-simno \ 0 381136 39879 43704 1 381212 39877 43636 2 381074 39878 43669 3 381093 39875 43674 4 381176 39879 43644 5 380907 39876 43677 6 381106 39879 43668 7 380897 39876 43692 8 380142 39879 43668 9 380310 39877 43694 10 380280 39879 43676 11 380121 39875 43692 stacks_ungapped-simlarge aftrrad-simno aftrrad-simlo \ 0 416856 40658 40007 1 416936 40658 40007 2 416890 40658 40007 3 416846 40658 40007 4 416861 40658 40007 5 416524 40658 40007 6 416844 40658 40007 7 416795 40658 40007 8 415751 40658 40007 9 416064 40658 40007 10 415945 40658 40007 11 415788 40658 40007 ... ipyrad-simhi pyrad-simno ddocent_full-simhi \ 0 ... 40402 43855 40041 1 ... 40402 43854 40045 2 ... 40402 43855 40041 3 ... 40402 43855 40049 4 ... 40402 43855 40054 5 ... 40399 43855 40057 6 ... 40402 43855 40043 7 ... 40402 43855 40041 8 ... 40400 43854 40053 9 ... 40400 43855 40060 10 ... 40400 43854 40054 11 ... 40400 43855 40054 stacks_defualt-simno pyrad-simlarge ddocent_full-simlo \ 0 32242 418730 39889 1 32199 418786 39887 2 32099 418656 39888 3 31740 418768 39887 4 31694 418782 39890 5 31644 418459 39889 6 31502 418659 39888 7 31443 418481 39887 8 30932 417726 39898 9 30922 417896 39893 10 30868 417747 39893 11 30683 417623 39888 stacks_gapped-simlarge stacks_defualt-simlo ddocent_filt-simhi \ 0 416856 31446 40008 1 416936 31372 40010 2 416890 31298 40003 3 416846 30969 40003 4 416861 30944 40008 5 416524 30881 40008 6 416844 30741 40002 7 416795 30662 40001 8 415751 30184 40005 9 416064 30149 40010 10 415945 30129 40004 11 415788 29904 40002 ddocent_full-simno 0 39981 1 39980 2 39980 3 39980 4 39980 5 39979 6 39980 7 39977 8 39981 9 39979 10 39980 11 39980 [12 rows x 33 columns] Unnamed: 0 ipyrad-simlarge pyrad-simhi pyrad-simlo \ 0 0 0 1 1 1 1 22 10 0 2 2 46 6 1 3 3 786 6 1 4 4 305 1 1 5 5 175 14 0 6 6 721 0 0 7 7 5301 10 2 8 8 5886 7 1 9 9 8687 6 2 10 10 39920 60 19 11 11 346258 42337 43081 ddocent_full-simlarge ddocent_filt-simlo stacks_gapped-simno \ 0 3 0 0 1 3 0 0 2 53 0 0 3 758 0 0 4 299 0 0 5 164 0 0 6 696 0 0 7 5106 0 0 8 5745 0 0 9 8397 0 24 10 37784 19 1362 11 330563 39860 42406 stacks_ungapped-simlarge aftrrad-simno aftrrad-simlo \ 0 6 0 0 1 25 0 0 2 56 0 0 3 819 0 0 4 317 0 0 5 201 0 0 6 880 0 0 7 5515 0 0 8 6288 0 0 9 10290 0 0 10 52091 0 0 11 350753 40658 40007 ... ipyrad-simhi pyrad-simno ddocent_full-simhi \ 0 ... 0 0 25 1 ... 0 0 3 2 ... 0 0 7 3 ... 3 0 11 4 ... 0 0 2 5 ... 0 0 2 6 ... 0 0 0 7 ... 5 0 8 8 ... 0 0 20 9 ... 0 0 7 10 ... 3 3 80 11 ... 40394 43852 39940 stacks_defualt-simno pyrad-simlarge ddocent_full-simlo \ 0 469 0 11 1 327 24 0 2 520 50 1 3 840 827 2 4 128 319 2 5 134 180 0 6 348 761 0 7 1715 5535 2 8 920 6183 2 9 1063 9158 1 10 4497 41802 19 11 23829 363123 39866 stacks_gapped-simlarge stacks_defualt-simlo ddocent_filt-simhi \ 0 6 472 0 1 25 347 0 2 56 539 0 3 819 833 0 4 317 143 0 5 201 134 0 6 880 397 0 7 5515 1754 0 8 6288 957 0 9 10290 1155 0 10 52091 4861 80 11 350753 22550 39932 ddocent_full-simno 0 0 1 0 2 0 3 0 4 0 5 0 6 0 7 1 8 0 9 1 10 9 11 39970 [12 rows x 33 columns]
This isn't very helpful, but you get an idea of what's going on. Values here for stacks gapped analysis look weird because they are bad. The populations program was segfaulting during creation of the vcf, so it's super truncated. Not sure what the problem is. The results down below the plots pull in stats from other files so they are better.
for statname, stat in {"sim_loc_cov":sim_loc_cov, "sim_snp_cov":sim_snp_cov,\
"sim_sample_nsnps":sim_sample_nsnps, "sim_sample_nlocs":sim_sample_nlocs}.items():
for sim in ["-simno", "-simlo", "-simhi", "-simlarge"]:
for prog in ["ipyrad", "pyrad", "stacks_ungapped", "stacks_gapped", "stacks_defualt", "ddocent_filt", "ddocent_full", "aftrrad"]:
try:
print(prog+sim + " " + statname + "\t"),
print(stat[prog + sim]),
print(np.mean(stat[prog + sim]))
except:
print("No {} stats for {}".format(statname, prog + sim))
print("------------------------------------------------------")
ipyrad-simno sim_loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9870] 822.5 pyrad-simno sim_loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9894] 824.5 stacks_ungapped-simno sim_loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 329, 9559] 824.416666667 stacks_gapped-simno sim_loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 329, 9559] 824.416666667 stacks_defualt-simno sim_loc_cov [350, 259, 365, 528, 68, 60, 153, 668, 315, 338, 1311, 6352] 897.25 ddocent_filt-simno sim_loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9852] 821.0 ddocent_full-simno sim_loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9852] 821.0 aftrrad-simno sim_loc_cov No sim_loc_cov stats for aftrrad-simno ------------------------------------------------------ ipyrad-simlo sim_loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9859] 821.583333333 pyrad-simlo sim_loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9884] 823.666666667 stacks_ungapped-simlo sim_loc_cov [24, 30, 31, 38, 6, 2, 8, 48, 48, 66, 573, 9131] 833.75 stacks_gapped-simlo sim_loc_cov [0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 57, 1452] 126.0 stacks_defualt-simlo sim_loc_cov [356, 278, 386, 532, 74, 64, 178, 690, 332, 371, 1447, 6105] 901.083333333 ddocent_filt-simlo sim_loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9853] 821.083333333 ddocent_full-simlo sim_loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9853] 821.083333333 aftrrad-simlo sim_loc_cov No sim_loc_cov stats for aftrrad-simlo ------------------------------------------------------ ipyrad-simhi sim_loc_cov [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 9842] 820.416666667 pyrad-simhi sim_loc_cov [0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 9875] 823.166666667 stacks_ungapped-simhi sim_loc_cov [54, 59, 66, 96, 17, 12, 18, 138, 106, 132, 943, 8509] 845.833333333 stacks_gapped-simhi sim_loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 56] 5.0 stacks_defualt-simhi sim_loc_cov [369, 288, 402, 556, 86, 78, 209, 714, 367, 444, 1614, 5738] 905.416666667 ddocent_filt-simhi sim_loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 9844] 820.666666667 ddocent_full-simhi sim_loc_cov [0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 4, 9844] 820.916666667 aftrrad-simhi sim_loc_cov No sim_loc_cov stats for aftrrad-simhi ------------------------------------------------------ ipyrad-simlarge sim_loc_cov [0, 12, 35, 436, 137, 70, 256, 1737, 1688, 2327, 9920, 81527] 8178.75 pyrad-simlarge sim_loc_cov [0, 13, 38, 446, 141, 72, 258, 1748, 1704, 2342, 9952, 81709] 8201.91666667 stacks_ungapped-simlarge sim_loc_cov [5, 14, 42, 440, 140, 82, 296, 1743, 1729, 2627, 12400, 78902] 8201.66666667 stacks_gapped-simlarge sim_loc_cov [5, 14, 42, 440, 140, 82, 296, 1743, 1729, 2627, 12400, 78902] 8201.66666667 stacks_defualt-simlarge sim_loc_cov [3666, 2729, 3623, 5435, 913, 878, 2294, 7764, 4321, 5593, 17308, 52404] 8910.66666667 ddocent_filt-simlarge sim_loc_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9808, 81396] 7600.33333333 ddocent_full-simlarge sim_loc_cov [1, 2, 38, 433, 139, 70, 258, 1732, 1689, 2322, 9808, 81397] 8157.41666667 aftrrad-simlarge sim_loc_cov No sim_loc_cov stats for aftrrad-simlarge ------------------------------------------------------ ipyrad-simno sim_sample_nlocs [9870, 9870, 9870, 9870, 9870, 9870, 9870, 9870, 9870, 9870, 9870, 9870] 9870.0 pyrad-simno sim_sample_nlocs [9894, 9894, 9894, 9894, 9894, 9894, 9894, 9894, 9894, 9894, 9894, 9894] 9894.0 stacks_ungapped-simno sim_sample_nlocs [9870, 9862, 9865, 9863, 9862, 9864, 9864, 9867, 9865, 9867, 9863, 9865] 9864.75 stacks_gapped-simno sim_sample_nlocs [9870, 9862, 9865, 9863, 9862, 9864, 9864, 9867, 9865, 9867, 9863, 9865] 9864.75 stacks_defualt-simno sim_sample_nlocs [9320, 9303, 9253, 9096, 9042, 9021, 8958, 8934, 8846, 8823, 8767, 8687] 9004.16666667 ddocent_filt-simno sim_sample_nlocs [9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852] 9852.0 ddocent_full-simno sim_sample_nlocs [9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852] 9852.0 aftrrad-simno sim_sample_nlocs No sim_sample_nlocs stats for aftrrad-simno ------------------------------------------------------ ipyrad-simlo sim_sample_nlocs [9859, 9859, 9859, 9859, 9859, 9859, 9859, 9859, 9859, 9859, 9859, 9859] 9859.0 pyrad-simlo sim_sample_nlocs [9884, 9884, 9884, 9884, 9884, 9884, 9884, 9884, 9884, 9884, 9884, 9884] 9884.0 stacks_ungapped-simlo sim_sample_nlocs [9819, 9808, 9818, 9812, 9832, 9825, 9826, 9801, 9811, 9808, 9817, 9801] 9814.83333333 stacks_gapped-simlo sim_sample_nlocs [1512, 1503, 1507, 1503, 1507, 1504, 1508, 1508, 1507, 1506, 1508, 1506] 1506.58333333 stacks_defualt-simlo sim_sample_nlocs [9275, 9254, 9207, 9055, 9020, 8993, 8929, 8888, 8807, 8779, 8741, 8645] 8966.08333333 ddocent_filt-simlo sim_sample_nlocs [9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853] 9853.0 ddocent_full-simlo sim_sample_nlocs [9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853] 9853.0 aftrrad-simlo sim_sample_nlocs No sim_sample_nlocs stats for aftrrad-simlo ------------------------------------------------------ ipyrad-simhi sim_sample_nlocs [9844, 9844, 9844, 9844, 9844, 9843, 9844, 9844, 9844, 9844, 9844, 9844] 9843.91666667 pyrad-simhi sim_sample_nlocs [9877, 9877, 9877, 9877, 9877, 9876, 9877, 9877, 9877, 9877, 9877, 9877] 9876.91666667 stacks_ungapped-simhi sim_sample_nlocs [9741, 9769, 9746, 9746, 9756, 9747, 9756, 9745, 9727, 9731, 9717, 9715] 9741.33333333 stacks_gapped-simhi sim_sample_nlocs [60, 60, 59, 60, 60, 60, 60, 60, 60, 60, 59, 58] 59.6666666667 stacks_defualt-simhi sim_sample_nlocs [9187, 9208, 9138, 8985, 8955, 8925, 8878, 8842, 8737, 8716, 8648, 8582] 8900.08333333 ddocent_filt-simhi sim_sample_nlocs [9848, 9848, 9848, 9848, 9848, 9848, 9846, 9847, 9847, 9848, 9848, 9848] 9847.66666667 ddocent_full-simhi sim_sample_nlocs [9848, 9848, 9848, 9850, 9850, 9850, 9848, 9848, 9849, 9850, 9850, 9850] 9849.08333333 aftrrad-simhi sim_sample_nlocs No sim_sample_nlocs stats for aftrrad-simhi ------------------------------------------------------ ipyrad-simlarge sim_sample_nlocs [95399, 95428, 95405, 95456, 95450, 95412, 95437, 95418, 95283, 95294, 95288, 95302] 95381.0 pyrad-simlarge sim_sample_nlocs [95646, 95675, 95649, 95708, 95695, 95659, 95686, 95672, 95543, 95552, 95544, 95558] 95632.25 stacks_ungapped-simlarge sim_sample_nlocs [95366, 95401, 95391, 95400, 95394, 95367, 95411, 95405, 95232, 95276, 95268, 95271] 95348.5 stacks_gapped-simlarge sim_sample_nlocs [95366, 95401, 95391, 95400, 95394, 95367, 95411, 95405, 95232, 95276, 95268, 95271] 95348.5 stacks_defualt-simlarge sim_sample_nlocs [90013, 89989, 89284, 87596, 87548, 87334, 86818, 86236, 85427, 85225, 84641, 83680] 86982.5833333 ddocent_filt-simlarge sim_sample_nlocs [90677, 90706, 90329, 89908, 90701, 90653, 90336, 89898, 90677, 90684, 90271, 89800] 90386.6666667 ddocent_full-simlarge sim_sample_nlocs [95169, 95200, 95167, 95218, 95203, 95164, 95198, 95177, 95048, 95056, 95049, 95052] 95141.75 aftrrad-simlarge sim_sample_nlocs No sim_sample_nlocs stats for aftrrad-simlarge ------------------------------------------------------ ipyrad-simno sim_sample_nsnps [41893, 41893, 41893, 41893, 41893, 41893, 41893, 41893, 41893, 41893, 41893, 41893] 41893.0 pyrad-simno sim_sample_nsnps [43855, 43854, 43855, 43855, 43855, 43855, 43855, 43855, 43854, 43855, 43854, 43855] 43854.75 stacks_ungapped-simno sim_sample_nsnps [43704, 43636, 43669, 43674, 43644, 43677, 43668, 43692, 43668, 43694, 43676, 43692] 43674.5 stacks_gapped-simno sim_sample_nsnps [43704, 43636, 43669, 43674, 43644, 43677, 43668, 43692, 43668, 43694, 43676, 43692] 43674.5 stacks_defualt-simno sim_sample_nsnps [32242, 32199, 32099, 31740, 31694, 31644, 31502, 31443, 30932, 30922, 30868, 30683] 31497.3333333 ddocent_filt-simno sim_sample_nsnps [39974, 39973, 39973, 39973, 39974, 39973, 39974, 39971, 39974, 39973, 39974, 39973] 39973.25 ddocent_full-simno sim_sample_nsnps [39981, 39980, 39980, 39980, 39980, 39979, 39980, 39977, 39981, 39979, 39980, 39980] 39979.75 aftrrad-simno sim_sample_nsnps [40658, 40658, 40658, 40658, 40658, 40658, 40658, 40658, 40658, 40658, 40658, 40658] 40658.0 ------------------------------------------------------ ipyrad-simlo sim_sample_nsnps [41126, 41126, 41126, 41126, 41126, 41126, 41126, 41126, 41126, 41126, 41126, 41126] 41126.0 pyrad-simlo sim_sample_nsnps [43106, 43105, 43105, 43099, 43100, 43101, 43104, 43101, 43103, 43106, 43106, 43103] 43103.25 stacks_ungapped-simlo sim_sample_nsnps [42878, 42825, 42865, 42829, 42921, 42896, 42910, 42819, 42825, 42861, 42865, 42806] 42858.3333333 stacks_gapped-simlo sim_sample_nsnps [6664, 6634, 6642, 6619, 6645, 6637, 6645, 6646, 6646, 6649, 6646, 6638] 6642.58333333 stacks_defualt-simlo sim_sample_nsnps [31446, 31372, 31298, 30969, 30944, 30881, 30741, 30662, 30184, 30149, 30129, 29904] 30723.25 ddocent_filt-simlo sim_sample_nsnps [39879, 39877, 39878, 39875, 39879, 39876, 39879, 39876, 39879, 39877, 39879, 39875] 39877.4166667 ddocent_full-simlo sim_sample_nsnps [39889, 39887, 39888, 39887, 39890, 39889, 39888, 39887, 39898, 39893, 39893, 39888] 39889.75 aftrrad-simlo sim_sample_nsnps [40007, 40007, 40007, 40007, 40007, 40007, 40007, 40007, 40007, 40007, 40007, 40007] 40007.0 ------------------------------------------------------ ipyrad-simhi sim_sample_nsnps [40402, 40402, 40402, 40402, 40402, 40399, 40402, 40402, 40400, 40400, 40400, 40400] 40401.0833333 pyrad-simhi sim_sample_nsnps [42421, 42421, 42417, 42423, 42416, 42421, 42421, 42421, 42427, 42423, 42425, 42423] 42421.5833333 stacks_ungapped-simhi sim_sample_nsnps [42007, 42161, 42001, 42096, 42063, 42034, 42104, 42068, 41959, 41942, 41970, 42002] 42033.9166667 stacks_gapped-simhi sim_sample_nsnps [250, 250, 247, 250, 250, 250, 250, 250, 250, 250, 246, 243] 248.833333333 stacks_defualt-simhi sim_sample_nsnps [30282, 30375, 30183, 29875, 29828, 29762, 29663, 29607, 29029, 28982, 28947, 28822] 29612.9166667 ddocent_filt-simhi sim_sample_nsnps [40008, 40010, 40003, 40003, 40008, 40008, 40002, 40001, 40005, 40010, 40004, 40002] 40005.3333333 ddocent_full-simhi sim_sample_nsnps [40041, 40045, 40041, 40049, 40054, 40057, 40043, 40041, 40053, 40060, 40054, 40054] 40049.3333333 aftrrad-simhi sim_sample_nsnps [39027, 39027, 39027, 39027, 39027, 39027, 39027, 39027, 39027, 39027, 39027, 39027] 39027.0 ------------------------------------------------------ ipyrad-simlarge sim_sample_nsnps [399293, 399339, 399224, 399319, 399355, 399039, 399259, 399070, 398377, 398531, 398376, 398234] 398951.333333 pyrad-simlarge sim_sample_nsnps [418730, 418786, 418656, 418768, 418782, 418459, 418659, 418481, 417726, 417896, 417747, 417623] 418359.416667 stacks_ungapped-simlarge sim_sample_nsnps [416856, 416936, 416890, 416846, 416861, 416524, 416844, 416795, 415751, 416064, 415945, 415788] 416508.333333 stacks_gapped-simlarge sim_sample_nsnps [416856, 416936, 416890, 416846, 416861, 416524, 416844, 416795, 415751, 416064, 415945, 415788] 416508.333333 stacks_defualt-simlarge sim_sample_nsnps [306434, 306316, 304795, 300499, 300565, 300050, 299270, 297832, 292556, 292370, 291639, 289438] 298480.333333 ddocent_filt-simlarge sim_sample_nsnps [366270, 366351, 364928, 363394, 366352, 366085, 365000, 363294, 366128, 366304, 364722, 362908] 365144.666667 ddocent_full-simlarge sim_sample_nsnps [381136, 381212, 381074, 381093, 381176, 380907, 381106, 380897, 380142, 380310, 380280, 380121] 380787.833333 aftrrad-simlarge sim_sample_nsnps [336112, 336112, 336112, 336112, 336112, 336112, 336112, 336112, 336112, 336112, 336112, 336112] 336112.0 ------------------------------------------------------ ipyrad-simno sim_snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 41893] 3491.08333333 pyrad-simno sim_snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 43852] 3654.58333333 stacks_ungapped-simno sim_snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 1362, 42406] 3649.33333333 stacks_gapped-simno sim_snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 1362, 42406] 3649.33333333 stacks_defualt-simno sim_snp_cov [469, 327, 520, 840, 128, 134, 348, 1715, 920, 1063, 4497, 23829] 2899.16666667 ddocent_filt-simno sim_snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 39965] 3331.16666667 ddocent_full-simno sim_snp_cov [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 9, 39970] 3331.75 aftrrad-simno sim_snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40658] 3388.16666667 ------------------------------------------------------ ipyrad-simlo sim_snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 41126] 3427.16666667 pyrad-simlo sim_snp_cov [1, 0, 1, 1, 1, 0, 0, 2, 1, 2, 19, 43081] 3592.41666667 stacks_ungapped-simlo sim_snp_cov [28, 36, 42, 67, 12, 2, 20, 145, 150, 237, 2306, 40279] 3610.33333333 stacks_gapped-simlo sim_snp_cov [0, 0, 0, 0, 0, 0, 0, 3, 0, 6, 233, 6422] 555.333333333 stacks_defualt-simlo sim_snp_cov [472, 347, 539, 833, 143, 134, 397, 1754, 957, 1155, 4861, 22550] 2845.16666667 ddocent_filt-simlo sim_snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19, 39860] 3323.25 ddocent_full-simlo sim_snp_cov [11, 0, 1, 2, 2, 0, 0, 2, 2, 1, 19, 39866] 3325.5 aftrrad-simlo sim_snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40007] 3333.91666667 ------------------------------------------------------ ipyrad-simhi sim_snp_cov [0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 3, 40394] 3367.08333333 pyrad-simhi sim_snp_cov [1, 10, 6, 6, 1, 14, 0, 10, 7, 6, 60, 42337] 3538.16666667 stacks_ungapped-simhi sim_snp_cov [58, 103, 112, 222, 54, 41, 50, 437, 403, 528, 3826, 37297] 3594.25 stacks_gapped-simhi sim_snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 236] 20.8333333333 stacks_defualt-simhi sim_snp_cov [477, 350, 564, 862, 169, 166, 462, 1802, 1037, 1398, 5298, 20663] 2770.66666667 ddocent_filt-simhi sim_snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 80, 39932] 3334.33333333 ddocent_full-simhi sim_snp_cov [25, 3, 7, 11, 2, 2, 0, 8, 20, 7, 80, 39940] 3342.08333333 aftrrad-simhi sim_snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 39027] 3252.25 ------------------------------------------------------ ipyrad-simlarge sim_snp_cov [0, 22, 46, 786, 305, 175, 721, 5301, 5886, 8687, 39920, 346258] 34008.9166667 pyrad-simlarge sim_snp_cov [0, 24, 50, 827, 319, 180, 761, 5535, 6183, 9158, 41802, 363123] 35663.5 stacks_ungapped-simlarge sim_snp_cov [6, 25, 56, 819, 317, 201, 880, 5515, 6288, 10290, 52091, 350753] 35603.4166667 stacks_gapped-simlarge sim_snp_cov [6, 25, 56, 819, 317, 201, 880, 5515, 6288, 10290, 52091, 350753] 35603.4166667 stacks_defualt-simlarge sim_snp_cov [4983, 3452, 5104, 8857, 1725, 1881, 5371, 20413, 12908, 18084, 60319, 194817] 28159.5 ddocent_filt-simlarge sim_snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 37780, 330513] 30691.0833333 ddocent_full-simlarge sim_snp_cov [3, 3, 53, 758, 299, 164, 696, 5106, 5745, 8397, 37784, 330563] 32464.25 aftrrad-simlarge sim_snp_cov [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 336112] 28009.3333333 ------------------------------------------------------
## Load the calldata into a dict so we don't have to keep loading and reloading it
calldata = {}
for sim in ["-simno", "-simlo", "-simhi", "-simlarge"]:
for prog in ["ipyrad", "pyrad", "stacks_ungapped", "stacks_gapped", \
"stacks_defualt", "ddocent_filt", "ddocent_full", "aftrrad"]:
print("{}".format(prog+sim)),
print("{}".format(sim_vcf_dict[prog+sim]))
c = vcfnp.calldata_2d(sim_vcf_dict[prog+sim], verbose=False).view(np.recarray)
calldata[prog+sim] = c
ipyrad-simno /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simno/simno_outfiles/simno-biallelic.recode.vcf pyrad-simno /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simno/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf stacks_ungapped-simno /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simno/batch_1-biallelic.recode.vcf stacks_gapped-simno /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simno/batch_1-biallelic.recode.vcf stacks_defualt-simno /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simno/batch_1-biallelic.recode.vcf ddocent_filt-simno /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simno/simno_fastqs/Final.recode-biallelic.recode.vcf ddocent_full-simno /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simno/simno_fastqs/TotalRawSNPs-biallelic.recode.vcf aftrrad-simno /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simno/Formatting/simno-biallelic.recode.vcf ipyrad-simlo /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simlo/simlo_outfiles/simlo-biallelic.recode.vcf pyrad-simlo /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simlo/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf stacks_ungapped-simlo /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simlo/batch_1-biallelic.recode.vcf stacks_gapped-simlo /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simlo/batch_1-biallelic.recode.vcf stacks_defualt-simlo /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simlo/batch_1-biallelic.recode.vcf ddocent_filt-simlo /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlo/simlo_fastqs/Final.recode-biallelic.recode.vcf ddocent_full-simlo /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlo/simlo_fastqs/TotalRawSNPs-biallelic.recode.vcf aftrrad-simlo /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simlo/Formatting/simlo-biallelic.recode.vcf ipyrad-simhi /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simhi/simhi_outfiles/simhi-biallelic.recode.vcf pyrad-simhi /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simhi/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf stacks_ungapped-simhi /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simhi/batch_1-biallelic.recode.vcf stacks_gapped-simhi /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simhi/batch_1-biallelic.recode.vcf stacks_defualt-simhi /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simhi/batch_1-biallelic.recode.vcf ddocent_filt-simhi /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/Final.recode-biallelic.recode.vcf ddocent_full-simhi /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/TotalRawSNPs-biallelic.recode.vcf aftrrad-simhi /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simhi/Formatting/simhi-biallelic.recode.vcf ipyrad-simlarge /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simlarge/simlarge_outfiles/simlarge-biallelic.recode.vcf pyrad-simlarge /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simlarge/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf stacks_ungapped-simlarge /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simlarge/batch_1-biallelic.recode.vcf stacks_gapped-simlarge /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simlarge/batch_1-biallelic.recode.vcf stacks_defualt-simlarge /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simlarge/batch_1-biallelic.recode.vcf ddocent_filt-simlarge /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlarge/simlarge_fastqs/Final.recode-biallelic.recode.vcf ddocent_full-simlarge /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlarge/simlarge_fastqs/TotalRawSNPs-biallelic.recode.vcf aftrrad-simlarge /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simlarge/Formatting/simlarge-biallelic.recode.vcf
pop1 = ["1A_0", "1B_0", "1C_0", "1D_0"]
pop2 = ["2E_0", "2F_0", "2G_0", "2H_0"]
pop3 = ["3I_0", "3J_0", "3K_0", "3L_0"]
sim_sample_names = pop1 + pop2 + pop3
flt = np.in1d(np.array(sim_sample_names), pop1)
print(flt)
[ True True True True False False False False False False False False]
## Some housekeeping with sample names to make the PCA plots prettier
pop1 = ["1A_0", "1B_0", "1C_0", "1D_0"]
pop2 = ["2E_0", "2F_0", "2G_0", "2H_0"]
pop3 = ["3I_0", "3J_0", "3K_0", "3L_0"]
pops = {"pop1":pop1, "pop2":pop2, "pop3":pop3}
pop_colors = {"pop1":"r", "pop2":"b", "pop3":"g"}
sim_sample_names = pop1 + pop2 + pop3
for sim in ["-simno", "-simlo", "-simhi", "-simlarge"]:
#for sim in ["-simhi"]:
print("Doing - {}".format(sim))
f, axarr = plt.subplots(2, 4, figsize=(16,8), dpi=1000)
axarr = [a for b in axarr for a in b]
for prog, ax in zip(["ipyrad", "pyrad", "stacks_ungapped", "stacks_gapped", \
"stacks_defualt", "ddocent_filt", "ddocent_full", "aftrrad"], axarr):
## Annoying debug messages
#print("{}".format(prog+sim)),
#print("{}".format(sim_vcf_dict[prog+sim]))
coords1, model1 = getPCA(calldata[prog+sim])
x = coords1[:, 0]
y = coords1[:, 1]
ax.scatter(x, y, marker='o')
ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))
for pop in pops.keys():
flt = np.in1d(np.array(sim_sample_names), pops[pop])
ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=pop_colors[pop], label=pop, markersize=10, mec='k', mew=.5)
ax.set_title(prog+sim, style="italic")
ax.axison = True
f.tight_layout()
Doing - -simno Doing - -simlo Doing - -simhi Doing - -simlarge
for sim in ["-simno", "-simlo", "-simhi", "-simlarge"]:
#for sim in ["-simhi"]:
print("Doing - {}".format(sim))
f, axarr = plt.subplots(2, 4, figsize=(16,8), dpi=1000)
axarr = [a for b in axarr for a in b]
for prog, ax in zip(["ipyrad", "pyrad", "stacks_ungapped", "stacks_gapped", \
"stacks_defualt", "ddocent_filt", "ddocent_full", "aftrrad"], axarr):
print("{}".format(prog+sim)),
print("{}".format(sim_vcf_dict[prog+sim]))
## Calculate pairwise distances
dist = getDistances(calldata[prog+sim])
## Doing it this way works, but allel uses imshow internally which rasterizes the image
#allel.plot.pairwise_distance(dist, labels=None, ax=ax, colorbar=False)
## Create the pcolormesh by hand
dat = ensure_square(dist)
## for some reason np.flipud(dat) is chopping off one row of data
p = ax.pcolormesh(np.arange(0,len(dat[0])), np.arange(0,len(dat[0])), dat,\
cmap="jet", vmin=np.min(dist), vmax=np.max(dist))
## Clip all heatmaps to actual sample size
p.axes.axis("tight")
ax.set_title(prog+sim, style="italic")
ax.axison = False
Doing - -simno ipyrad-simno /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simno/simno_outfiles/simno-biallelic.recode.vcf pyrad-simno /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simno/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf stacks_ungapped-simno /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simno/batch_1-biallelic.recode.vcf stacks_gapped-simno /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simno/batch_1-biallelic.recode.vcf stacks_defualt-simno /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simno/batch_1-biallelic.recode.vcf ddocent_filt-simno /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simno/simno_fastqs/Final.recode-biallelic.recode.vcf ddocent_full-simno /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simno/simno_fastqs/TotalRawSNPs-biallelic.recode.vcf aftrrad-simno /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simno/Formatting/simno-biallelic.recode.vcf Doing - -simlo ipyrad-simlo /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simlo/simlo_outfiles/simlo-biallelic.recode.vcf pyrad-simlo /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simlo/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf stacks_ungapped-simlo /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simlo/batch_1-biallelic.recode.vcf stacks_gapped-simlo /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simlo/batch_1-biallelic.recode.vcf stacks_defualt-simlo /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simlo/batch_1-biallelic.recode.vcf ddocent_filt-simlo /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlo/simlo_fastqs/Final.recode-biallelic.recode.vcf ddocent_full-simlo /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlo/simlo_fastqs/TotalRawSNPs-biallelic.recode.vcf aftrrad-simlo /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simlo/Formatting/simlo-biallelic.recode.vcf Doing - -simhi ipyrad-simhi /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simhi/simhi_outfiles/simhi-biallelic.recode.vcf pyrad-simhi /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simhi/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf stacks_ungapped-simhi /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simhi/batch_1-biallelic.recode.vcf stacks_gapped-simhi /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simhi/batch_1-biallelic.recode.vcf stacks_defualt-simhi /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simhi/batch_1-biallelic.recode.vcf ddocent_filt-simhi /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/Final.recode-biallelic.recode.vcf ddocent_full-simhi /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/TotalRawSNPs-biallelic.recode.vcf aftrrad-simhi /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simhi/Formatting/simhi-biallelic.recode.vcf Doing - -simlarge ipyrad-simlarge /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simlarge/simlarge_outfiles/simlarge-biallelic.recode.vcf pyrad-simlarge /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simlarge/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf stacks_ungapped-simlarge /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simlarge/batch_1-biallelic.recode.vcf stacks_gapped-simlarge /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simlarge/batch_1-biallelic.recode.vcf stacks_defualt-simlarge /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simlarge/batch_1-biallelic.recode.vcf ddocent_filt-simlarge /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlarge/simlarge_fastqs/Final.recode-biallelic.recode.vcf ddocent_full-simlarge /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlarge/simlarge_fastqs/TotalRawSNPs-biallelic.recode.vcf aftrrad-simlarge /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simlarge/Formatting/simlarge-biallelic.recode.vcf
What we want to know is total number of loci recovered (not just variable loci). First we'll create some dictionaries just like above, but we'll call them full to indicate that they include monomorphic sites. Because snps don't occur in monomorphic sites kind of by definition, we only really are iterested in the depth across loci and the number of loci recovered per sample.
sim_full_loc_cov = collections.OrderedDict()
sim_full_sample_nlocs = collections.OrderedDict()
## Try just doing them all the same
for prog, filename in sim_vcf_dict.items():
print("Doing - {}\t".format(prog)),
sim_full_loc_cov[prog] = []
sim_full_sample_nlocs[prog] = []
Doing - ipyrad-simlarge Doing - pyrad-simhi Doing - pyrad-simlo Doing - ddocent_full-simlarge Doing - ddocent_filt-simlo Doing - stacks_gapped-simno Doing - stacks_ungapped-simlarge Doing - aftrrad-simno Doing - aftrrad-simlo Doing - ddocent_filt-simno Doing - stacks_gapped-simlo Doing - ipyrad-simlo Doing - stacks_defualt-simhi Doing - ddocent_filt-simlarge Doing - stacks_gapped-simhi Doing - ipyrad-simno Doing - aftrrad-simlarge Doing - stacks_ungapped-simhi Doing - aftrrad-simhi Doing - stacks_ungapped-simno Doing - stacks_defualt-simlarge Doing - stacks_ungapped-simlo Doing - ipyrad-simhi Doing - pyrad-simno Doing - ddocent_full-simhi Doing - stacks_defualt-simno Doing - pyrad-simlarge Doing - ddocent_full-simlo Doing - stacks_gapped-simlarge Doing - stacks_defualt-simlo Doing - ddocent_filt-simhi Doing - ddocent_full-simno
## ipyrad stats
IPYRAD_SIMOUT=os.path.join(IPYRAD_DIR, "SIMDATA/")
for sim in ["simno", "simlo", "simhi", "simlarge"]:
print("Doing - {}\t".format(sim)),
simstring = "ipyrad-" + sim
simdir = os.path.join(IPYRAD_SIMOUT, sim)
statsfile = simdir + "/{}_outfiles/{}_stats.txt".format(sim, sim)
infile = open(statsfile).readlines()
sample_coverage = [int(x.strip().split()[1]) for x in infile[20:32]]
#print(sample_coverage)
print("mean sample coverage - {}\t".format(np.mean(sample_coverage))),
print("min/max - {}/{}\t".format(np.min(sample_coverage), np.max(sample_coverage)))
sim_full_sample_nlocs[simstring] = sample_coverage
nmissing = [int(x.strip().split()[1]) for x in infile[38:50]]
sim_full_loc_cov[simstring] = nmissing
## Just look at the ones we care about for ipyrad
print([(x,y) for x,y in sim_full_loc_cov.items() if "ipyrad" in x])
print([(x,y) for x,y in sim_full_sample_nlocs.items() if "ipyrad" in x])
Doing - simno mean sample coverage - 10000.0 min/max - 10000/10000 Doing - simlo mean sample coverage - 10000.0 min/max - 10000/10000 Doing - simhi mean sample coverage - 9999.91666667 min/max - 9999/10000 Doing - simlarge mean sample coverage - 96942.6666667 min/max - 96917/96986 [('ipyrad-simlarge', [0, 27, 61, 600, 158, 80, 274, 1851, 1742, 2385, 10113, 82659]), ('ipyrad-simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10000]), ('ipyrad-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10000]), ('ipyrad-simhi', [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 9998]), ('ipyrad_simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10000]), ('ipyrad_simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10000]), ('ipyrad_simhi', [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 9998]), ('ipyrad_simlarge', [0, 27, 61, 600, 158, 80, 274, 1851, 1742, 2385, 10113, 82659])] [('ipyrad-simlarge', [96923, 96959, 96931, 96986, 96972, 96933, 96958, 96945, 96917, 96928, 96926, 96934]), ('ipyrad-simlo', [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000]), ('ipyrad-simno', [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000]), ('ipyrad-simhi', [10000, 10000, 10000, 10000, 10000, 9999, 10000, 10000, 10000, 10000, 10000, 10000])]
## ipyrad stats
PYRAD_SIMOUT=os.path.join(PYRAD_DIR, "SIMDATA/")
for sim in ["simno", "simlo", "simhi", "simlarge"]:
simstring = "pyrad-"+sim
print("Doing - {}\t".format(sim)),
simdir = os.path.join(PYRAD_SIMOUT, sim)
statsfile = simdir + "/stats/c85d6m2p3H3N3.stats"
infile = open(statsfile).readlines()
sample_coverage = [int(x.strip().split()[1]) for x in infile[8:20]]
print("mean sample coverage - {}\t".format(np.mean(sample_coverage))),
print("min/max - {}/{}\t".format(np.min(sample_coverage), np.max(sample_coverage)))
sim_full_sample_nlocs[simstring] = sample_coverage
nmissing = [0] + [int(x.strip().split()[1]) for x in infile[26:37]]
sim_full_loc_cov[simstring] = nmissing
## Just look at the ones we care about for pyrad
print([(x,y) for x,y in sim_full_loc_cov.items() if "pyrad" in x])
print([(x,y) for x,y in sim_full_sample_nlocs.items() if "pyrad" in x])
Doing - simno mean sample coverage - 10000.0 min/max - 10000/10000 Doing - simlo mean sample coverage - 10000.0 min/max - 10000/10000 Doing - simhi mean sample coverage - 9999.91666667 min/max - 9999/10000 Doing - simlarge mean sample coverage - 96943.6666667 min/max - 96918/96987 [('ipyrad-simlarge', []), ('pyrad-simhi', [0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 9998]), ('pyrad-simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10000]), ('ipyrad-simlo', []), ('ipyrad-simno', []), ('ipyrad-simhi', []), ('pyrad-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10000]), ('pyrad-simlarge', [0, 27, 61, 600, 158, 80, 274, 1851, 1742, 2385, 10113, 82660]), ('ipyrad_simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10000]), ('ipyrad_simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10000]), ('ipyrad_simhi', [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 9998]), ('ipyrad_simlarge', [0, 27, 61, 600, 158, 80, 274, 1851, 1742, 2385, 10113, 82659]), ('pyrad_simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10000]), ('pyrad_simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10000]), ('pyrad_simhi', [0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 9998]), ('pyrad_simlarge', [0, 27, 61, 600, 158, 80, 274, 1851, 1742, 2385, 10113, 82660])] [('ipyrad-simlarge', [96923, 96959, 96931, 96986, 96972, 96933, 96958, 96945, 96917, 96928, 96926, 96934]), ('pyrad-simhi', [10000, 10000, 10000, 10000, 10000, 9999, 10000, 10000, 10000, 10000, 10000, 10000]), ('pyrad-simlo', [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000]), ('ipyrad-simlo', [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000]), ('ipyrad-simno', [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000]), ('ipyrad-simhi', [10000, 10000, 10000, 10000, 10000, 9999, 10000, 10000, 10000, 10000, 10000, 10000]), ('pyrad-simno', [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000]), ('pyrad-simlarge', [96924, 96960, 96932, 96987, 96973, 96934, 96959, 96946, 96918, 96929, 96927, 96935]), ('pyrad_simno', [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000]), ('pyrad_simlo', [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000]), ('pyrad_simhi', [10000, 10000, 10000, 10000, 10000, 9999, 10000, 10000, 10000, 10000, 10000, 10000]), ('pyrad_simlarge', [96924, 96960, 96932, 96987, 96973, 96934, 96959, 96946, 96918, 96929, 96927, 96935])]
import gzip
## stacks stats
STACKS_SIMOUT=os.path.join(STACKS_DIR, "SIMDATA/")
STACKS_GAP_SIMOUT=os.path.join(STACKS_SIMOUT, "gapped/")
STACKS_UNGAP_SIMOUT=os.path.join(STACKS_SIMOUT, "ungapped/")
STACKS_DEFAULT_SIMOUT=os.path.join(STACKS_SIMOUT, "default/")
for dir in [STACKS_GAP_SIMOUT, STACKS_UNGAP_SIMOUT, STACKS_DEFAULT_SIMOUT]:
stacks_method = dir.split("/")[-2]
for sim in ["simno", "simlo", "simhi", "simlarge"]:
#print("Doing - {}-{}".format(stacks_method, sim)),
simstring = "stacks_"+stacks_method+"-"+sim
try:
simdir = os.path.join(dir, sim)
lines = open("{}/batch_1.haplotypes.tsv".format(simdir)).readlines()
cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]]
sim_full_loc_cov[simstring] = [cnts.count(i) for i in range(1,13)]
except Exception as inst:
print("loc_cov - {} - {}".format(inst, simdir))
try:
sim_full_sample_nlocs[simstring] = []
samp_haps = glob.glob("{}/*matches*".format(simdir))
for f in samp_haps:
lines = gzip.open(f).readlines()
sim_full_sample_nlocs[simstring].append(len(lines) - 1)
except Exception as inst:
print("sample_nlocs - {} - {}".format(inst, simdir))
## Just look at the stacks results to reduce the clutter
print([(x,y) for x,y in sim_full_loc_cov.items() if "stacks" in x])
print([(x,y) for x,y in sim_full_sample_nlocs.items() if "stacks" in x])
[('stacks_gapped-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 336, 9652]), ('stacks_ungapped-simlarge', [49, 29, 67, 593, 156, 92, 311, 1840, 1767, 2674, 12583, 79747]), ('stacks_gapped-simlo', [19, 5, 1, 3, 1, 1, 1, 1, 1, 15, 348, 9623]), ('stacks_defualt-simhi', []), ('stacks_gapped-simhi', [241, 28, 25, 28, 5, 1, 4, 29, 21, 39, 506, 9387]), ('stacks_ungapped-simhi', [1322, 174, 123, 148, 19, 13, 21, 148, 112, 137, 956, 8611]), ('stacks_ungapped-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 336, 9652]), ('stacks_defualt-simlarge', []), ('stacks_ungapped-simlo', [589, 79, 54, 56, 7, 2, 8, 52, 50, 67, 584, 9231]), ('stacks_defualt-simno', []), ('stacks_gapped-simlarge', [49, 29, 67, 593, 156, 92, 311, 1840, 1767, 2674, 12583, 79747]), ('stacks_defualt-simlo', []), ('stacks_default-simno', [3307, 709, 652, 748, 85, 66, 175, 727, 332, 353, 1349, 6446]), ('stacks_default-simlo', [3868, 754, 699, 773, 92, 73, 200, 749, 354, 387, 1488, 6207]), ('stacks_default-simhi', [4592, 837, 740, 833, 105, 90, 239, 783, 390, 465, 1660, 5842]), ('stacks_default-simlarge', [33633, 7241, 6542, 7614, 1102, 1025, 2505, 8307, 4487, 5750, 17674, 53253])] [('stacks_gapped-simno', [11381, 11435, 11446, 11406, 11400, 11432, 11468, 11442, 11393, 11438, 11466, 11365]), ('stacks_ungapped-simlarge', [110839, 110710, 110603, 110764, 110698, 110572, 110670, 110795, 110708, 110617, 110649, 110860]), ('stacks_gapped-simlo', [11365, 11399, 11420, 11375, 11381, 11395, 11435, 11423, 11354, 11408, 11428, 11343]), ('stacks_defualt-simhi', []), ('stacks_gapped-simhi', [11311, 11365, 11389, 11342, 11332, 11371, 11406, 11400, 11336, 11382, 11424, 11320]), ('stacks_ungapped-simhi', [11371, 11444, 11438, 11413, 11403, 11426, 11466, 11451, 11382, 11428, 11472, 11355]), ('stacks_ungapped-simno', [11381, 11435, 11446, 11406, 11400, 11432, 11468, 11442, 11393, 11438, 11466, 11365]), ('stacks_defualt-simlarge', []), ('stacks_ungapped-simlo', [11388, 11426, 11458, 11407, 11409, 11432, 11471, 11444, 11384, 11434, 11469, 11365]), ('stacks_defualt-simno', []), ('stacks_gapped-simlarge', [110839, 110710, 110603, 110764, 110698, 110572, 110670, 110795, 110708, 110617, 110649, 110860]), ('stacks_defualt-simlo', []), ('stacks_default-simno', [11381, 11434, 11372, 11250, 11132, 10993, 10974, 10982, 11010, 10897, 10824, 10702]), ('stacks_default-simlo', [11385, 11426, 11388, 11256, 11155, 11010, 10991, 10997, 11013, 10905, 10843, 10724]), ('stacks_default-simhi', [11371, 11443, 11368, 11271, 11155, 11017, 11002, 11025, 11022, 10924, 10878, 10739]), ('stacks_gapped-large', []), ('stacks_ungapped-large', []), ('stacks_default-large', []), ('stacks_default-simlarge', [110835, 110690, 109833, 109066, 108063, 106263, 105795, 106033, 106842, 104942, 104216, 104106])]
AftrRAD doesn't natively support vcf. the vcf files we made are in sim*/Formatting/sim*.vcf
It's not straightforward how to get the locations for snps from the output files. aftrRAD does calculate and write these out as part of the Genotype.pl phase to a file called /TempFiles/SNPLocationsToPlot.txt
It's also not straightforward how to get the number of loci recovered for each invidual.
You have to total up the counts from two different files for each individual. The
Outputs/Genotypes
directory has two files, one for Haplotypes
and one for Monomorphics
.
The haplotypes file is a giant matrix of concatenated snps per locus so you have to
search through each column to count up all the NA
's (missing data per locus per individual).
Then you have to count up the number of non-zero monomorphic sites.
AFTRRAD_SIMOUT=os.path.join(AFTRRAD_DIR, "SIMDATA/")
for sim in ["simno", "simlo", "simhi", "simlarge"]:
#for sim in ["simlo"]:
simstring = "aftrrad-{}".format(sim)
print("Doing - {}".format(simstring))
simdir = os.path.join(AFTRRAD_SIMOUT, sim)
## The `17` in the file name is the min % of samples w/ data
hapsfile = simdir + "/Output/Genotypes/Haplotypes_17.All.txt"
monofile = simdir + "/Output/Genotypes/Monomorphics_17.txt"
## Set low_memory=False or else simlarge crashes pandas
mono_df = pd.read_csv(monofile, delim_whitespace=True, header=0, low_memory=False)
haps_df = pd.read_csv(hapsfile, delim_whitespace=True, header=0, index_col=0, low_memory=False)
sample_coverage = {}
## Get the list of all the simulated sample names
samplenames = mono_df.columns.values[1:]
## Do haplotypes file first because it is phased so there are 2 rows per sample
## so we end up double counting NA's. If you read the # of monomorphics first
## then you have to figure out some way to prevent the double counting, here
## we just count twice and overwrite.
for row in haps_df.itertuples():
sample_coverage[row[0].split("Individual")[1]] = len(pd.Series(row).nonzero()[0])
for sname in samplenames:
## nonzero returns a tuple of which we are only interested in the first element
sample_coverage[sname] += len(mono_df[sname].nonzero()[0])
## To sort or not to sort the coverage_df?
print("mean sample coverage - {}".format(np.mean(sample_coverage.values()))),
print("min/max - {}/{}".format(np.min(sample_coverage.values()), np.max(sample_coverage.values())))
## Put them in the dict sorted by sample name
sim_full_sample_nlocs[simstring] = [sample_coverage[x] for x in sorted(sample_coverage.keys())]
## Now do locus coverage
## For the same reason as above, do the haplotypes file first. We create the tmp_counts
## counter and then make a new counter with the index being 1/2 to account for the
## doublecounting of NaN in the haplotypes file.
## Also, note that annoyingly pandas reads in "NA" as float('nan') rather than string "NA"
## which took me a _while_ to figure out.
hap_nonzero = collections.Counter(haps_df.count())
tmp_counts = collections.Counter(hap_nonzero)
hap_counts = collections.Counter()
for x in tmp_counts:
hap_counts.update({x/2:tmp_counts[x]})
## Get monomorphics
## Get the sum of non-zero elements per row (the fancy .ix slicing drops the
## first column which is the sequence)
mono_nonzero = (mono_df.ix[:, 1:] != 0).astype(int).sum(axis=1)
mono_counts = collections.Counter(mono_nonzero)
tot_counts = hap_counts + mono_counts
## Collections are unordered, and also any coverage level with no count
## will not be in the collection, so we have to order it and insert zeros at the appropriate locations
dat = []
for i in xrange(1,13):
try:
dat.append(tot_counts[i])
except:
dat.append(0)
sim_full_loc_cov[simstring] = dat
print("Locus coverage")
print([(x,y) for x,y in sim_full_loc_cov.items() if "aftrrad" in x])
print("Sample nloci")
print([(x,y) for x,y in sim_full_sample_nlocs.items() if "aftrrad" in x])
Doing - aftrrad-simlarge mean sample coverage aftrrad-simlarge - 100172.75 min/max - 100132/100249 Locus coverage [('aftrrad-simno', [0, 1, 17, 29, 5, 1, 3, 27, 15, 9, 36, 9910]), ('aftrrad-simlo', [0, 3, 20, 37, 7, 0, 4, 34, 18, 16, 58, 9870]), ('aftrrad-simlarge', [0, 19, 226, 838, 213, 117, 332, 2078, 1878, 2529, 10526, 81656]), ('aftrrad-simhi', [0, 11, 35, 57, 5, 1, 5, 53, 30, 22, 129, 9761])] Sample nloci [('aftrrad-simno', [10051, 10051, 10051, 10051, 10050, 10050, 10050, 10050, 10052, 10053, 10053, 10052]), ('aftrrad-simlo', [10059, 10060, 10058, 10059, 10059, 10059, 10059, 10058, 10061, 10062, 10062, 10059]), ('aftrrad-simlarge', [100138, 100141, 100137, 100135, 100141, 100134, 100137, 100132, 100244, 100244, 100249, 100241]), ('aftrrad-simhi', [10088, 10089, 10083, 10082, 10087, 10085, 10082, 10081, 10089, 10090, 10087, 10085])]
~~Here's what's weird. The dDocent reference.fasta file which is generated by the cd-hit clustering looks like it contains the right number of clusters. Similarly, if you use samtools to view the sample -RG.bam
, and the mapped.bed
file has 10000 loci as well. But if you look at the Contigs that make it to the final vcf files (either TotalRawSNPs or Final.recode) there are ~100+ contigs missing. I'm sure freebayes is filtering loci, but i diff'd the loci in the bed file and the vcf, and looked at the sequences of one of the missing loci in the cat-RRG.bam file and they look more or less normal. It's not the mapping quality flaq (-m), because all sim seqs have high quality mapping. None of the other settings in the dDocent script or the default freebayes settings seem particularly conspicuous.~~ Monomorphic loci obviously aren't going to end up in the final vcf. It is incredibly not straightforward how to actually get access to the monomorphic loci from the intermediary files that dDocent creates. The shortcuts that dDocent takes to run super fast mean it's really hard to get "stacks" of sequence data per locus (e.g. the ipyrad .loci file).
grep Contig TotalRawSNPs.vcf | cut -f 1 | uniq -c | wc -l
simno/simlo/simhi - 9887/9889/9890
For simlarge there was a much bigger difference between the TotalRaw and Final.recode vcf files (98350/91620), TotalRaw recovers about 98.4% of true loci, but the final recode catches only about 92%.
It's also not super straightforward how to get # loci per sample from the dDocent output, so I had to gin something up. Right now because it uses samtools to view the bam file and then does some shell manipulation, this is a little slow (20 seconds per simulation treatment), but tolerable.
TODO: This is currently not tracking which samples correspond with the number of loci recovered bcz the ddocent bam files use the weird naming scheme and I haven't back calculated the good names yet.
This isn't exactly technically correct, because this is the counts of loci mapped for each individual, not the number of loci for each individual in the output!
Also, this is a little slow. Takes about 30-40 minutes.
import subprocess
DDOCENT_SIMOUT=os.path.join(DDOCENT_DIR, "SIMDATA/")
for sim in ["simno", "simlo", "simhi", "simlarge"]:
#for sim in ["simlo"]:
simdir = os.path.join(DDOCENT_SIMOUT, sim + "/" + sim + "_fastqs/")
print("Doing {} - {}".format(sim, simdir))
## This is hackish. You have to dip into the bam file to get locus counts that
## include counts for monomorphics. Note! that field 3 of the bam file is not
## the sequence data, but rather the dDocent mock-contig ('dDocent_Contig_3')
## the read maps to. So really this is kind of by proxy counting the number of loci.
sample_coverage = []
for samp in glob.glob(simdir + "/*-RG.bam"):
cmd = "{}samtools view {} | cut -f 3 | uniq | wc -l".format(DDOCENT_DIR, samp)
res = subprocess.check_output(cmd, shell=True)
sample_coverage.append(int(res.strip()))
print("This must be taken with a grain of salt because these aren't the actual counts you get out of the final data files.")
print("mean sample coverage - {}".format(np.mean(sample_coverage)))
print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))
print("Reading VCF")
vcf_filt = pd.read_csv("{}/Final.recode.vcf".format(simdir), delim_whitespace=True, header=60, index_col=0)
vcf_full = pd.read_csv("{}/TotalRawSNPs.vcf".format(simdir), delim_whitespace=True, header=60, index_col=0)
for vcf_string, vcf_df in zip(["full", "filt"], [vcf_full, vcf_filt]):
simstring = "ddocent_" + vcf_string + "-" + sim
print("Doing {}".format(simstring))
## Sample coverage is the same for both conditions here, because they are based on the
## same raw data.
idxs = set(vcf_df.index)
## Here we are going to simultaneously accumulate information about nloci per sample and locus coverage counts.
c = []
sample_cov_counter = collections.Counter()
for i, idx in enumerate(idxs):
## Poor man's progress bar. For the impatient.
if not i % 1000:
print(i),
## Complicated song and dance here because if the locus only has one snp you have to do
## some bullshit to reshape the dataframe.
tmp_df = vcf_df.loc[idx]
if isinstance(tmp_df, pd.Series):
tmp_df = tmp_df.to_frame().T
## The double apply is applying split() to all rows and columns to get the genotype data
nonzero_samps = (tmp_df.iloc[:, 8:20].apply(lambda x: x.apply(lambda y: y.split(":")[0])) != "./.").\
astype(int).sum().nonzero()[0]
sample_cov_counter.update(nonzero_samps)
c.append(len(nonzero_samps))
## The double apply is applying split() to all rows and columns to get the genotype data
# count = len((tmp_df.iloc[:, 8:20].\
# apply(lambda x: x.apply(lambda y: y.split(":")[0])) != "./.").astype(int).sum().nonzero()[0])
# c.append(count)
counts = collections.Counter(c)
## Fill in zero values that aren't in the locus counter
dat = []
for i in xrange(1,13):
try:
dat.append(counts[i])
except:
dat.append(0)
sim_full_loc_cov[simstring] = dat
sim_full_sample_nlocs[simstring] = sample_cov_counter.values()
# Here's a different stupid way to do this...
#
# sample_counts = {}
# sample_names = list(vcf.columns)[8:]
# print(sample_names)
# print("num loci - {}".format(len(set(vcf.index))))
# for name in sample_names:
# print(name)
# sample_counts[name] = 0
# for locus in set(vcf.index):
# snps = vcf[name][locus]
# if any(map(lambda x: x.split(":")[0] != "./.", snps)):
# sample_counts[name] += 1
# else:
# pass
# #print("{} {} {}".format(name, locus, snps))
# print(sample_counts)
print("Locus coverage")
print([(x,y) for x,y in sim_full_loc_cov.items() if "ddocent" in x])
print("Sample nloci")
print([(x,y) for x,y in sim_full_sample_nlocs.items() if "ddocent" in x])
Doing simno - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simno/simno_fastqs/ This must be taken with a grain of salt because these aren't the actual counts you get out of the final data files. mean sample coverage - 10000.0 min/max - 10000/10000 Reading VCF Doing ddocent_full-simno 0 1000 2000 3000 4000 5000 6000 7000 8000 9000 Doing ddocent_filt-simno 0 1000 2000 3000 4000 5000 6000 7000 8000 9000 Locus coverage [('ddocent_full-simlarge', []), ('ddocent_filt-simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9888]), ('ddocent_filt-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9886]), ('ddocent_filt-simlarge', []), ('ddocent_full-simhi', [0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 4, 9881]), ('ddocent_full-simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9888]), ('ddocent_filt-simhi', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 9881]), ('ddocent_full-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9886])] Sample nloci [('ddocent_full-simlarge', []), ('ddocent_filt-simlo', []), ('ddocent_filt-simno', [9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886]), ('ddocent_filt-simlarge', []), ('ddocent_full-simhi', []), ('ddocent_full-simlo', []), ('ddocent_filt-simhi', []), ('ddocent_full-simno', [9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886])] Doing simlo - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlo/simlo_fastqs/ This must be taken with a grain of salt because these aren't the actual counts you get out of the final data files. mean sample coverage - 10000.0 min/max - 10000/10000 Reading VCF Doing ddocent_full-simlo 0 1000 2000 3000 4000 5000 6000 7000 8000 9000 Doing ddocent_filt-simlo 0 1000 2000 3000 4000 5000 6000 7000 8000 9000 Locus coverage [('ddocent_full-simlarge', []), ('ddocent_filt-simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9888]), ('ddocent_filt-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9886]), ('ddocent_filt-simlarge', []), ('ddocent_full-simhi', [0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 4, 9881]), ('ddocent_full-simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9888]), ('ddocent_filt-simhi', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 9881]), ('ddocent_full-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9886])] Sample nloci [('ddocent_full-simlarge', []), ('ddocent_filt-simlo', [9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888]), ('ddocent_filt-simno', [9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886]), ('ddocent_filt-simlarge', []), ('ddocent_full-simhi', []), ('ddocent_full-simlo', [9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888]), ('ddocent_filt-simhi', []), ('ddocent_full-simno', [9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886])] Doing simhi - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/ This must be taken with a grain of salt because these aren't the actual counts you get out of the final data files. mean sample coverage - 9998.75 min/max - 9997/10000 Reading VCF Doing ddocent_full-simhi 0 1000 2000 3000 4000 5000 6000 7000 8000 9000 Doing ddocent_filt-simhi 0 1000 2000 3000 4000 5000 6000 7000 8000 9000 Locus coverage [('ddocent_full-simlarge', []), ('ddocent_filt-simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9888]), ('ddocent_filt-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9886]), ('ddocent_filt-simlarge', []), ('ddocent_full-simhi', [0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 4, 9881]), ('ddocent_full-simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9888]), ('ddocent_filt-simhi', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 9881]), ('ddocent_full-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9886])] Sample nloci [('ddocent_full-simlarge', []), ('ddocent_filt-simlo', [9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888]), ('ddocent_filt-simno', [9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886]), ('ddocent_filt-simlarge', []), ('ddocent_full-simhi', [9885, 9885, 9886, 9887, 9888, 9888, 9886, 9886, 9886, 9887, 9887, 9888]), ('ddocent_full-simlo', [9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888]), ('ddocent_filt-simhi', [9885, 9885, 9885, 9885, 9885, 9885, 9883, 9884, 9884, 9885, 9885, 9885]), ('ddocent_full-simno', [9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886])] Doing simlarge - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlarge/simlarge_fastqs/ This must be taken with a grain of salt because these aren't the actual counts you get out of the final data files. mean sample coverage - 96949.5 min/max - 96919/96988 Reading VCF Doing ddocent_full-simlarge 0 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 13000 14000 15000 16000 17000 18000 19000 20000 21000 22000 23000 24000 25000 26000 27000 28000 29000 30000 31000 32000 33000 34000 35000 36000 37000 38000 39000 40000 41000 42000 43000 44000 45000 46000 47000 48000 49000 50000 51000 52000 53000 54000 55000 56000 57000 58000 59000 60000 61000 62000 63000 64000 65000 66000 67000 68000 69000 70000 71000 72000 73000 74000 75000 76000 77000 78000 79000 80000 81000 82000 83000 84000 85000 86000 87000 88000 89000 90000 91000 92000 93000 94000 95000 96000 97000 10000 11000 12000 13000 14000 15000 16000 17000 18000 19000 20000 21000 22000 23000 24000 25000 26000 27000 28000 29000 30000 31000 32000 33000 34000 35000 36000 37000 38000 39000 40000 41000 42000 43000 44000 45000 46000 47000 48000 49000 50000 51000 52000 53000 54000 55000 56000 57000 58000 59000 60000 61000 62000 63000 64000 65000 66000 67000 68000 69000 70000 71000 72000 73000 74000 75000 76000 77000 78000 79000 80000 81000 82000 83000 84000 85000 86000 87000 88000 89000 90000 91000 Locus coverage [('ddocent_full-simlarge', [1, 2, 38, 434, 141, 71, 258, 1748, 1703, 2334, 9853, 81766]), ('ddocent_filt-simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9888]), ('ddocent_filt-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9886]), ('ddocent_filt-simlarge', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9853, 81766]), ('ddocent_full-simhi', [0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 4, 9881]), ('ddocent_full-simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9888]), ('ddocent_filt-simhi', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 9881]), ('ddocent_full-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9886])] Sample nloci [('ddocent_full-simlarge', [95609, 95639, 95606, 95654, 95650, 95609, 95639, 95625, 95492, 95499, 95494, 95502]), ('ddocent_filt-simlo', [9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888]), ('ddocent_filt-simno', [9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886]), ('ddocent_filt-simlarge', [91091, 91119, 90741, 90312, 91116, 91065, 90744, 90308, 91090, 91095, 90682, 90212]), ('ddocent_full-simhi', [9885, 9885, 9886, 9887, 9888, 9888, 9886, 9886, 9886, 9887, 9887, 9888]), ('ddocent_full-simlo', [9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888]), ('ddocent_filt-simhi', [9885, 9885, 9885, 9885, 9885, 9885, 9883, 9884, 9884, 9885, 9885, 9885]), ('ddocent_full-simno', [9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886])]
This data is worth its weight in gold, it takes forever to acquire, so lets save it off to disk so we don't lose it.
import pickle
sim_res_dict = {"sim_sample_nlocs_df":sim_full_sample_nlocs, "sim_full_loc_cov":sim_full_loc_cov}
for k,v in sim_res_dict.items():
print(k)
pickle.dump(v, open(WORK_DIR + "RESULTS/" + k + ".p", 'wb'))
sim_full_loc_cov sim_sample_nlocs_df
simlevel = ["simno", "simlo", "simhi", "simlarge"]
assemblers = ["ipyrad", "pyrad", "stacks_gapped", "stacks_ungapped",\
"stacks_default", "aftrrad", "ddocent_full", "ddocent_filt"]
sim_sample_nlocs_df = pd.DataFrame(index=assemblers, columns=simlevel)
for sim in simlevel:
for ass in assemblers:
simstring = "-".join([ass, sim])
sim_sample_nlocs_df[sim][ass] = np.mean(sim_full_sample_nlocs[simstring])
print("Mean number of loci recovered per sample.")
## Normalize all bins
dat = sim_sample_nlocs_df[sim_sample_nlocs_df.columns].astype(float)
for sim in simlevel:
scale = 10000
if sim == "simlarge":
scale = 100000
dat[sim] = dat[sim]/scale
sns.heatmap(dat, square=True, center=1, linewidths=2, annot=True)
print(sim_sample_nlocs_df)
#print(dat)
Mean number of loci recovered per sample. simno simlo simhi simlarge ipyrad 10000 10000 9999.92 96942.7 pyrad 10000 10000 9999.92 96943.7 stacks_gapped 11422.7 11393.8 11364.8 110707 stacks_ungapped 11422.7 11423.9 11420.8 110707 stacks_default 11079.2 11091.1 11101.2 107224 aftrrad 10051.2 10059.6 10085.7 100173 ddocent_full 9886 9888 9886.58 95584.8 ddocent_filt 9886 9888 9884.67 90797.9
simlevel = ["simno", "simlo", "simhi", "simlarge"]
assemblers = ["ipyrad", "pyrad", "stacks_gapped", "stacks_ungapped",\
"stacks_default", "aftrrad", "ddocent_full", "ddocent_filt"]
df = pd.DataFrame(index=assemblers, columns=simlevel)
df2 = pd.DataFrame(columns = ["assembler", "simtype"] + [x for x in xrange(1,13)])
df3 = pd.DataFrame(columns = ["assembler", "simtype", "simdata"])
## Try to get a dataframe in the shape seaborn will be happy with
dfsns = pd.DataFrame(columns = ["assembler", "simtype", "bin", "simdata"])
for sim in simlevel:
for ass in assemblers:
simstring = ass + "-" + sim
df[sim][ass] = np.array(sim_full_loc_cov[simstring])
df2.loc[simstring] = [ass, sim] + sim_full_loc_cov[simstring]
df3.loc[simstring] = [ass, sim, np.array(sim_full_loc_cov[simstring])]
for i, val in enumerate(sim_full_loc_cov[simstring]):
## Normalize values so different sim sizes print the same
max = 10000.
if "large" in simstring:
max = 100000.
dfsns.loc[simstring + "-" + str(i)] = [ass, sim, i+1, val/max]
sns.set(style="white")
## This kinda sucks, too spread out.
#g = sns.FacetGrid(dfsns, row="assembler", col="simtype", margin_titles=True)
#g.map(sns.barplot, "bin", "simdata", color="steelblue", lw=0)
plt.rcParams['figure.figsize']=(20,20)
g = sns.FacetGrid(dfsns, col="simtype", margin_titles=True, col_wrap=2, size=4, aspect=2)
g.map(sns.barplot, "bin", "simdata", "assembler", lw=0.5, palette="bright").add_legend()
<seaborn.axisgrid.FacetGrid at 0x2aab7b71a7d0>
## Barplots of assembly method x simdata type
g = sns.FacetGrid(dfsns, col="assembler", margin_titles=True, col_wrap=4, size=3, aspect=1.5)
g.map(sns.barplot, "bin", "simdata", "simtype", lw=0.5, palette="bright").add_legend()
<seaborn.axisgrid.FacetGrid at 0x2aab7b85fcd0>
It just looks nicer, makes more sense.
dfsns2 = pd.DataFrame(columns = ["assembler", "simtype", "bin", "simdata"])
for sim in simlevel:
for ass in assemblers:
simstring = ass + "-" + sim
## Normalize values so different sim sizes print the same
max = 10000.
if "large" in simstring:
max = 100000.
newdat = sim_full_loc_cov[simstring]
newdat = [sum(newdat)-sum(newdat[:i-1]) for i in range(1,13)]
for i, val in enumerate(newdat):
dfsns2.loc[simstring + "-" + str(i)] = [ass, sim, i+1, val]
g = sns.FacetGrid(dfsns2, col="simtype", hue="assembler", sharex=True, sharey=False, size=4, col_wrap=2)
g.map(plt.scatter, "bin", "simdata")
g.map(plt.plot, "bin", "simdata").add_legend()
axs = g.axes
for ax in axs:
ax.set_xlim(0,12.5)
Here we'll pull together all the output vcf files and filter out everything except biallelic snps. This takes a few minutes to run.
vcf_dict = {}
vcf_dict["ipyrad"] = os.path.join(IPYRAD_DIR, "REALDATA/REALDATA_outfiles/REALDATA.vcf")
vcf_dict["pyrad"] = os.path.join(PYRAD_EMPIRICAL_OUTPUT, "outfiles/c85d6m2p3H3N3.vcf")
vcf_dict["stacks_ungapped"] = os.path.join(STACKS_DIR, "REALDATA/ungapped/batch_1.vcf")
vcf_dict["stacks_gapped"] = os.path.join(STACKS_DIR, "REALDATA/gapped/batch_1.vcf")
vcf_dict["stacks_default"] = os.path.join(STACKS_DIR, "REALDATA/default/batch_1.vcf")
vcf_dict["aftrrad"] = os.path.join(AFTRRAD_DIR, "REALDATA/Formatting/REALDATA.vcf")
vcf_dict["ddocent_full"] = os.path.join(DDOCENT_DIR, "REALDATA/TotalRawSNPs.vcf")
vcf_dict["ddocent_filt"] = os.path.join(DDOCENT_DIR, "REALDATA/Final.recode.vcf")
for k, f in vcf_dict.items():
if os.path.exists(f):
print("found - {}".format(f))
## If it's gzipped then unzip it (only applies to ipyrad i think)
if ".gz" in f:
print("gunzipping")
cmd = "gunzip {}".format(f)
!cmd
vcf_dict[k] = f.split(".gz")[0]
## Remove all but biallelic (for ipyrad this also removes all the monomorphic)
#outfile = f.split(".vcf")[0]+"-biallelic"
#cmd = "{}vcftools --vcf {} --min-alleles 2 --max-alleles 2 --recode --out {}" \
# .format(DDOCENT_DIR, f, outfile)
#print(cmd)
#!$cmd
## update the vcf_dict
#vcf_dict[k] = outfile + ".recode.vcf"
else:
print("not found - {}".format(f))
found - /home/iovercast/manuscript-analysis/stacks/REALDATA/default/batch_1.vcf /home/iovercast/manuscript-analysis/dDocent/vcftools --vcf /home/iovercast/manuscript-analysis/stacks/REALDATA/default/batch_1.vcf --min-alleles 2 --max-alleles 2 --recode --out /home/iovercast/manuscript-analysis/stacks/REALDATA/default/batch_1-biallelic VCFtools - v0.1.11 (C) Adam Auton 2009 Parameters as interpreted: --vcf /home/iovercast/manuscript-analysis/stacks/REALDATA/default/batch_1.vcf --max-alleles 2 --min-alleles 2 --out /home/iovercast/manuscript-analysis/stacks/REALDATA/default/batch_1-biallelic --recode Reading Index file. File contains 149570 entries and 13 individuals. Applying Required Filters. Filtering sites by number of alleles After filtering, kept 13 out of 13 Individuals After filtering, kept 149570 out of a possible 149570 Sites Outputting VCF file... Done Run Time = 5.00 seconds found - /home/iovercast/manuscript-analysis/dDocent/REALDATA/TotalRawSNPs.vcf /home/iovercast/manuscript-analysis/dDocent/vcftools --vcf /home/iovercast/manuscript-analysis/dDocent/REALDATA/TotalRawSNPs.vcf --min-alleles 2 --max-alleles 2 --recode --out /home/iovercast/manuscript-analysis/dDocent/REALDATA/TotalRawSNPs-biallelic VCFtools - v0.1.11 (C) Adam Auton 2009 Parameters as interpreted: --vcf /home/iovercast/manuscript-analysis/dDocent/REALDATA/TotalRawSNPs.vcf --max-alleles 2 --min-alleles 2 --out /home/iovercast/manuscript-analysis/dDocent/REALDATA/TotalRawSNPs-biallelic --recode Reading Index file. File contains 238142 entries and 13 individuals. Applying Required Filters. Filtering sites by number of alleles After filtering, kept 13 out of 13 Individuals After filtering, kept 190405 out of a possible 238142 Sites Outputting VCF file... Done Run Time = 15.00 seconds found - /home/iovercast/manuscript-analysis/pyrad/REALDATA/outfiles/c85d6m2p3H3N3.vcf /home/iovercast/manuscript-analysis/dDocent/vcftools --vcf /home/iovercast/manuscript-analysis/pyrad/REALDATA/outfiles/c85d6m2p3H3N3.vcf --min-alleles 2 --max-alleles 2 --recode --out /home/iovercast/manuscript-analysis/pyrad/REALDATA/outfiles/c85d6m2p3H3N3-biallelic VCFtools - v0.1.11 (C) Adam Auton 2009 Parameters as interpreted: --vcf /home/iovercast/manuscript-analysis/pyrad/REALDATA/outfiles/c85d6m2p3H3N3.vcf --max-alleles 2 --min-alleles 2 --out /home/iovercast/manuscript-analysis/pyrad/REALDATA/outfiles/c85d6m2p3H3N3-biallelic --recode Reading Index file. File contains 319936 entries and 13 individuals. Applying Required Filters. Filtering sites by number of alleles After filtering, kept 13 out of 13 Individuals After filtering, kept 308247 out of a possible 319936 Sites Outputting VCF file... Done Run Time = 5.00 seconds not found - /home/iovercast/manuscript-analysis/aftrRAD/REALDATA/Formatting/REALDATA.vcf found - /home/iovercast/manuscript-analysis/ipyrad/REALDATA/REALDATA_outfiles/REALDATA.vcf /home/iovercast/manuscript-analysis/dDocent/vcftools --vcf /home/iovercast/manuscript-analysis/ipyrad/REALDATA/REALDATA_outfiles/REALDATA.vcf --min-alleles 2 --max-alleles 2 --recode --out /home/iovercast/manuscript-analysis/ipyrad/REALDATA/REALDATA_outfiles/REALDATA-biallelic VCFtools - v0.1.11 (C) Adam Auton 2009 Parameters as interpreted: --vcf /home/iovercast/manuscript-analysis/ipyrad/REALDATA/REALDATA_outfiles/REALDATA.vcf --max-alleles 2 --min-alleles 2 --out /home/iovercast/manuscript-analysis/ipyrad/REALDATA/REALDATA_outfiles/REALDATA-biallelic --recode Reading Index file. File contains 4554084 entries and 13 individuals. Applying Required Filters. Filtering sites by number of alleles After filtering, kept 13 out of 13 Individuals After filtering, kept 220118 out of a possible 4554084 Sites Outputting VCF file... Done Run Time = 17.00 seconds not found - /home/iovercast/manuscript-analysis/stacks/REALDATA/gapped/batch_1.vcf found - /home/iovercast/manuscript-analysis/dDocent/REALDATA/Final.recode.vcf /home/iovercast/manuscript-analysis/dDocent/vcftools --vcf /home/iovercast/manuscript-analysis/dDocent/REALDATA/Final.recode.vcf --min-alleles 2 --max-alleles 2 --recode --out /home/iovercast/manuscript-analysis/dDocent/REALDATA/Final.recode-biallelic VCFtools - v0.1.11 (C) Adam Auton 2009 Parameters as interpreted: --vcf /home/iovercast/manuscript-analysis/dDocent/REALDATA/Final.recode.vcf --max-alleles 2 --min-alleles 2 --out /home/iovercast/manuscript-analysis/dDocent/REALDATA/Final.recode-biallelic --recode Reading Index file. File contains 99971 entries and 13 individuals. Applying Required Filters. Filtering sites by number of alleles After filtering, kept 13 out of 13 Individuals After filtering, kept 80040 out of a possible 99971 Sites Outputting VCF file... Done Run Time = 6.00 seconds found - /home/iovercast/manuscript-analysis/stacks/REALDATA/ungapped/batch_1.vcf /home/iovercast/manuscript-analysis/dDocent/vcftools --vcf /home/iovercast/manuscript-analysis/stacks/REALDATA/ungapped/batch_1.vcf --min-alleles 2 --max-alleles 2 --recode --out /home/iovercast/manuscript-analysis/stacks/REALDATA/ungapped/batch_1-biallelic VCFtools - v0.1.11 (C) Adam Auton 2009 Parameters as interpreted: --vcf /home/iovercast/manuscript-analysis/stacks/REALDATA/ungapped/batch_1.vcf --max-alleles 2 --min-alleles 2 --out /home/iovercast/manuscript-analysis/stacks/REALDATA/ungapped/batch_1-biallelic --recode Reading Index file. Building new index file. Scanning Chromosome: un Writing Index file. File contains 344633 entries and 13 individuals. Applying Required Filters. Filtering sites by number of alleles After filtering, kept 13 out of 13 Individuals After filtering, kept 344633 out of a possible 344633 Sites Outputting VCF file... Done Run Time = 11.00 seconds
emp_loc_cov = collections.OrderedDict()
emp_snp_cov = collections.OrderedDict()
emp_sample_nsnps = collections.OrderedDict()
emp_sample_nlocs = collections.OrderedDict()
## Try just doing them all the same
for prog, filename in vcf_dict.items():
try:
print("Doing - {}".format(prog))
print("\t{}".format(filename))
v = vcfnp.variants(filename, dtypes={"CHROM":"a24"}).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)
emp_loc_cov[prog] = loci_coverage(v, c, prog)
emp_snp_cov[prog] = snp_coverage(c)
emp_sample_nsnps[prog] = sample_nsnps(c)
emp_sample_nlocs[prog] = sample_nloci(v, c, prog)
plotPCA(c, prog)
plotPairwiseDistance(c, prog)
except Exception as inst:
print(inst)
[vcfnp] 2016-10-18 11:08:57.760390 :: caching is disabled [vcfnp] 2016-10-18 11:08:57.760901 :: building array
Doing - stacks_default /home/iovercast/manuscript-analysis/stacks/REALDATA/default/batch_1-biallelic.recode.vcf
[vcfnp] 2016-10-18 11:08:59.403870 :: caching is disabled [vcfnp] 2016-10-18 11:08:59.404332 :: building array [vcfnp] 2016-10-18 11:09:19.246532 :: caching is disabled [vcfnp] 2016-10-18 11:09:19.247164 :: building array
Doing - ddocent_full /home/iovercast/manuscript-analysis/dDocent/REALDATA/TotalRawSNPs-biallelic.recode.vcf
[vcfnp] 2016-10-18 11:09:25.020836 :: caching is disabled [vcfnp] 2016-10-18 11:09:25.021373 :: building array [vcfnp] 2016-10-18 11:10:08.979173 :: caching is disabled [vcfnp] 2016-10-18 11:10:08.979877 :: building array
Doing - pyrad /home/iovercast/manuscript-analysis/pyrad/REALDATA/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf
[vcfnp] 2016-10-18 11:10:12.003180 :: caching is disabled [vcfnp] 2016-10-18 11:10:12.003658 :: building array [vcfnp] 2016-10-18 11:10:36.855671 :: caching is disabled [vcfnp] 2016-10-18 11:10:36.856239 :: building array
Doing - aftrrad /home/iovercast/manuscript-analysis/aftrRAD/REALDATA/Formatting/REALDATA.vcf file not found: /home/iovercast/manuscript-analysis/aftrRAD/REALDATA/Formatting/REALDATA.vcf Doing - ipyrad /home/iovercast/manuscript-analysis/ipyrad/REALDATA/REALDATA_outfiles/REALDATA-biallelic.recode.vcf
[vcfnp] 2016-10-18 11:10:39.041659 :: caching is disabled [vcfnp] 2016-10-18 11:10:39.042141 :: building array [vcfnp] 2016-10-18 11:10:57.915668 :: caching is disabled [vcfnp] 2016-10-18 11:10:57.916290 :: building array
Doing - stacks_gapped /home/iovercast/manuscript-analysis/stacks/REALDATA/gapped/batch_1.vcf file not found: /home/iovercast/manuscript-analysis/stacks/REALDATA/gapped/batch_1.vcf Doing - ddocent_filt /home/iovercast/manuscript-analysis/dDocent/REALDATA/Final.recode-biallelic.recode.vcf
[vcfnp] 2016-10-18 11:11:00.415481 :: caching is disabled [vcfnp] 2016-10-18 11:11:00.416015 :: building array [vcfnp] 2016-10-18 11:11:19.294158 :: caching is disabled [vcfnp] 2016-10-18 11:11:19.294788 :: building array
Doing - stacks_ungapped /home/iovercast/manuscript-analysis/stacks/REALDATA/ungapped/batch_1-biallelic.recode.vcf
[vcfnp] 2016-10-18 11:11:23.090947 :: caching is disabled [vcfnp] 2016-10-18 11:11:23.091441 :: building array
for i in emp_sample_nlocs:
print(i),
print(emp_sample_nlocs[i])
stacks_default [22215, 32074, 25377, 16640, 23557, 17426, 32172, 34354, 31333, 29809, 34792, 24791, 25256] ddocent_full [34816, 37668, 35523, 25583, 36635, 25751, 38051, 39179, 38946, 38744, 39209, 37612, 37240] pyrad [22826, 34866, 30179, 19710, 19076, 21977, 36830, 36774, 37212, 29720, 38012, 35350, 31155] ipyrad [18710, 28396, 24635, 15667, 15301, 17830, 29885, 29492, 30213, 24043, 30671, 29076, 25580] ddocent_filt [18640, 18904, 18745, 18471, 18713, 18553, 18969, 19050, 19096, 19068, 19086, 19182, 19164] stacks_ungapped [23103, 35494, 30792, 19377, 19547, 21469, 36663, 36949, 36703, 29789, 37968, 33544, 31931]
First we load in the variant info and call data for all the snps
IPYRAD_EMPIRICAL_OUTPUT=os.path.join(IPYRAD_DIR, "REALDATA/")
IPYRAD_STATS = os.path.join(IPYRAD_EMPIRICAL_OUTPUT, "REALDATA_outfiles/REALDATA_stats.txt")
infile = open(IPYRAD_STATS).readlines()
sample_coverage = [int(x.strip().split()[1]) for x in infile[20:33]]
print(sample_coverage)
print("mean sample coverage - {}".format(np.mean(sample_coverage)))
print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))
[21163, 31863, 27902, 20852, 17779, 23008, 33289, 33697, 35765, 29184, 35096, 38144, 34746] mean sample coverage - 29422.1538462 min/max - 17779/38144
Read the biallelic vcf
filename = os.path.join(IPYRAD_EMPIRICAL_OUTPUT, "REALDATA_outfiles/REALDATA.biallelic.vcf")
# filename = vcf_dict["ipyrad"]
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)
[vcfnp] 2016-10-12 12:42:26.099077 :: caching is disabled [vcfnp] 2016-10-12 12:42:26.099836 :: building array [vcfnp] 2016-10-12 12:42:29.441796 :: caching is disabled [vcfnp] 2016-10-12 12:42:29.442748 :: building array
Getting variable sites and parsimony informative sites from the vcf is kind of annoying because all the programs export slightly different formats, so you need to parse them in slightly different ways. There's a better way to do this for ipyrad but i figure i'll do it the same way for all of them so it's more clear what's happening.
## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["POS"]), mask=only_pis).compressed()
## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x) for x in pis])
#distpis = [x for x in sorted(counts.items())]
## Getting the distvar is easier
distvar = Counter([int(x) for x in v.POS])
#distvar = [x for x in sorted(counts.items())]
canvas, axes = SNP_position_plot(distvar, distpis)
## save fig
#toyplot.html.render(canvas, 'snp_positions.html')
canvas
## Have to gunzip the ipyrad REALDATA vcf
plotPCA(c, "ipyrad")
plotPairwiseDistance(c, "pyrad")
PYRAD_EMPIRICAL_OUTPUT=os.path.join(PYRAD_DIR, "REALDATA/")
PYRAD_STATS = os.path.join(PYRAD_EMPIRICAL_OUTPUT, "stats/c85d6m2p3H3N3.stats")
infile = open(PYRAD_STATS).readlines()
sample_coverage = [int(x.strip().split()[1]) for x in infile[8:21]]
print(sample_coverage)
print("mean sample coverage - {}".format(np.mean(sample_coverage)))
print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))
[23924, 37002, 31852, 23698, 20063, 26050, 38931, 39423, 41250, 33230, 40809, 42948, 38645] mean sample coverage - 33678.8461538 min/max - 20063/42948
Be careful because sometimes the pyrad vcf gets a newline inserted between the last line of metadata and the first line of genotypes, which causes vcfnp to silently fail. Also there are a couple flags you can pass to vcfnp for debugging, but the print hella data to the console.
v = vcfnp.variants(filename, progress=1, verbose=True).view(np.recarray)
filename = os.path.join(PYRAD_EMPIRICAL_OUTPUT, "outfiles/c85d6m2p3H3N3.vcf")
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)
[vcfnp] 2016-10-08 15:40:30.499958 :: caching is disabled [vcfnp] 2016-10-08 15:40:30.500883 :: building array [vcfnp] 2016-10-08 15:40:37.399061 :: caching is disabled [vcfnp] 2016-10-08 15:40:37.417059 :: building array
## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["POS"]), mask=only_pis).compressed()
## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x) for x in pis])
#distpis = [x for x in sorted(counts.items())]
## Getting the distvar is easier
distvar = Counter([int(x) for x in v.POS])
#distvar = [x for x in sorted(counts.items())]
canvas, axes = SNP_position_plot(distvar, distpis)
## save fig
#toyplot.html.render(canvas, 'snp_positions.html')
canvas
STACKS_OUTPUT=os.path.join(STACKS_DIR, "REALDATA/")
STACKS_GAP_OUT=os.path.join(STACKS_OUTPUT, "gapped/")
STACKS_UNGAP_OUT=os.path.join(STACKS_OUTPUT, "ungapped/")
STACKS_DEFAULT_OUT=os.path.join(STACKS_OUTPUT, "default/")
#lines = open("SIMsmall/stackf_high/batch_1.haplotypes.tsv").readlines()
#cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]]
#shigh = [cnts.count(i) for i in range(1,13)]
filename = os.path.join(STACKS_UNGAP_OUT, "batch_1.vcf")
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)
[vcfnp] 2016-10-12 10:48:17.512074 :: caching is disabled [vcfnp] 2016-10-12 10:48:17.512879 :: building array [vcfnp] 2016-10-12 10:48:25.055213 :: caching is disabled [vcfnp] 2016-10-12 10:48:25.061273 :: building array
## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["ID"]), mask=only_pis).compressed()
## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x.split("_")[1]) for x in pis])
#distpis = [x for x in sorted(counts.items())]
## Getting the distvar is easier
distvar = Counter([int(x.split("_")[1]) for x in v.ID])
#distvar = [x for x in sorted(counts.items())]
canvas, axes = SNP_position_plot(distvar, distpis)
## save fig
#toyplot.html.render(canvas, 'snp_positions.html')
canvas
plotPCA(c, "stacks ungapped")
plotPairwiseDistance(c, "stacks ungapped")
[vcfnp] 2016-09-03 19:04:05.829749 :: caching is disabled [vcfnp] 2016-09-03 19:04:05.830278 :: building array
filename = os.path.join(STACKS_DEFAULT_OUT, "batch_1.vcf")
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)
[vcfnp] 2016-10-12 10:53:17.257960 :: caching is disabled [vcfnp] 2016-10-12 10:53:17.259180 :: building array [vcfnp] 2016-10-12 10:53:20.403885 :: caching is disabled [vcfnp] 2016-10-12 10:53:20.409034 :: building array
TODO
## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["ID"]), mask=only_pis).compressed()
## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x.split("_")[1]) for x in pis])
#distpis = [x for x in sorted(counts.items())]
## Getting the distvar is easier
distvar = Counter([int(x.split("_")[1]) for x in v.ID])
#distvar = [x for x in sorted(counts.items())]
canvas, axes = SNP_position_plot(distvar, distpis)
## save fig
toyplot.html.render(canvas, 'snp_positions.html')
canvas
plotPCA(c, "stacks default")
plotPairwiseDistance(c, "stacks default")
[vcfnp] 2016-09-03 18:56:31.855138 :: caching is disabled [vcfnp] 2016-09-03 18:56:31.855756 :: building array
<matplotlib.axes._subplots.AxesSubplot at 0x2aab15d09e90>
AFTRRAD_OUTPUT=os.path.join(AFTRRAD_DIR, "REALDATA/")
filename = os.path.join(STACKS_DEFAULT_OUT, "batch_1.vcf")
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)
NOT TESTED
It's not very straightforward how to extract information about whether each snp is informative or not, given the output files that aftrRAD generates. It does create a file with all the snp locations so we can use this to generate the distribution of variable sites, but the pis will be empty.
f = os.path.join(AFTRRAD_OUTPUT, "TempFiles/SNPLocationsToPlot.txt")
with open(f) as infile:
snplocs = infile.read().split()
## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([])
## Getting the distvar is easier
distvar = Counter(map(int, snplocs))
canvas, axes = SNP_position_plot(distvar, distpis)
## save fig
## toyplot.html.render(canvas, 'snp_positions.html')
canvas
HAVE TO MAKE SURE THE SAMPLES IN THE DDOCENT OUTPUT VCF ARE IN THE SAME ORDER AS THOSE IN THE IPYRAD/STACKS There is now code in the horserace notebook to rename samples in the ddocent vcf and to reorder the samples to match ipyrad/stacks.
import subprocess
DDOCENT_OUTPUT=os.path.join(DDOCENT_DIR, "REALDATA/")
os.chdir(DDOCENT_OUTPUT)
print("Getting max loci per sample from bam files.")
sample_coverage = []
for samp in glob.glob("*-RG.bam"):
cmd = "{}samtools view {} | cut -f 3 | uniq | wc -l".format(DDOCENT_DIR, samp)
res = subprocess.check_output(cmd, shell=True)
print("nloci {} - {}".format(res.strip(), samp))
sample_coverage.append(int(res.strip()))
print(sorted(sample_coverage, reverse=True))
print("mean sample coverage - {}".format(np.mean(sample_coverage)))
print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))
#sim_coverage_df["ddocent_"+sim] = sample_coverage
vcf_filt = pd.read_csv("Final.recode.vcf", delim_whitespace=True, header=60, index_col=0)
vcf_full = pd.read_csv("TotalRawSNPs.vcf", delim_whitespace=True, header=60, index_col=0)
## There's gotta be a faster way to do this...
## It's really fuckin slow on real data, like hours.
for vcf in [vcf_full, vcf_filt]:
sample_counts = {}
sample_names = list(vcf.columns)[8:]
print(sample_names)
print("num loci in vcf - {}".format(len(set(vcf.index))))
for name in sample_names:
print(name)
sample_counts[name] = 0
for locus in set(vcf.index):
snps = vcf[name][locus]
if any(map(lambda x: x.split(":")[0] != "./.", snps)):
sample_counts[name] += 1
else:
pass
print(sample_counts)
print(sim_coverage_df)
Getting max loci per sample from bam files. nloci 37358 - Pop1_Sample1-RG.bam nloci 41001 - Pop3_Sample1-RG.bam nloci 29663 - Pop2_Sample2-RG.bam nloci 42706 - Pop3_Sample2-RG.bam nloci 29490 - Pop2_Sample1-RG.bam nloci 38202 - Pop4_Sample2-RG.bam nloci 42543 - Pop3_Sample3-RG.bam nloci 42667 - Pop3_Sample4-RG.bam nloci 42999 - Pop3_Sample5-RG.bam nloci 43950 - Pop4_Sample1-RG.bam nloci 44300 - Pop4_Sample3-RG.bam nloci 40740 - Pop5_Sample1-RG.bam nloci 39368 - Pop5_Sample2-RG.bam [44300, 43950, 42999, 42706, 42667, 42543, 41001, 40740, 39368, 38202, 37358, 29663, 29490] mean sample coverage - 39614.3846154 min/max - 29490/44300
## Basic difference btw Final.recode and TotalRawSNPs is that final recode
## only includes individuals with snps called in > 90% of samples
f1 = os.path.join(DDOCENT_OUTPUT, "TotalRawSNPs.vcf")
v_full = vcfnp.variants(f1, dtypes={"CHROM":"a24"}).view(np.recarray)
c_full = vcfnp.calldata_2d(f1).view(np.recarray)
f2 = os.path.join(DDOCENT_OUTPUT, "Final.recode.vcf")
v_filt = vcfnp.variants(f2, dtypes={"CHROM":"a24"}).view(np.recarray)
c_filt = vcfnp.calldata_2d(f2).view(np.recarray)
[vcfnp] 2016-10-15 15:13:53.339632 :: caching is disabled [vcfnp] 2016-10-15 15:13:53.340435 :: building array [vcfnp] 2016-10-15 15:14:20.818988 :: caching is disabled [vcfnp] 2016-10-15 15:14:20.832036 :: building array [vcfnp] 2016-10-15 15:15:48.271026 :: caching is disabled [vcfnp] 2016-10-15 15:15:48.272360 :: building array [vcfnp] 2016-10-15 15:15:59.214829 :: caching is disabled [vcfnp] 2016-10-15 15:15:59.224579 :: building array
for dset in [[v_full, c_full], [v_filt, c_filt]]:
## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), dset[1]["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(dset[0]["POS"]), mask=only_pis).compressed()
## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x) for x in pis])
## Getting the distvar is easier
distvar = Counter([int(x) for x in v_filt.POS])
canvas, axes = SNP_position_plot(distvar, distpis)
## save fig
## toyplot.html.render(canvas, 'snp_positions.html')
canvas
#Tmp delete me
print(np.sum(distvar.values()))
print(np.sum(distpis.values()))
99971 60996
for dset in [[v_full, c_full], [v_filt, c_filt]]:
plotPCA(dset[1], "ddocent")
plotPairwiseDistance(c_full, "dDocent Full")
plotPairwiseDistance(c_filt, "dDocent Filtered")
## Testing for different vcf formats
prog = "stacks-simhi"
filename = "/home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simhi/batch_1.vcf"
#prog = "dDocent-simhi"
#filename = "/home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/TotalRawSNPs.vcf"
#prog = "ipyrad-simlarge"
#filename = "/home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simlarge/simlarge_outfiles/simlarge-biallelic.recode.vcf"
prog = "pyrad-simhi"
filename = "/home/iovercast/manuscript-analysis/pyrad/SIMDATA/simhi/outfiles/c85d6m2p3H3N3.vcf"
v = vcfnp.variants(filename, dtypes={"CHROM":"a24"}).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)
print("loc_cov", loci_coverage(v, c, prog))
print("snp_cov", snp_coverage(c))
print("snps", sample_nsnps(c))
print("nlocs", sample_nloci(v, c, prog))
[vcfnp] 2016-10-17 14:32:50.401685 :: caching is disabled [vcfnp] 2016-10-17 14:32:50.402203 :: building array [vcfnp] 2016-10-17 14:32:50.880046 :: caching is disabled [vcfnp] 2016-10-17 14:32:50.880506 :: building array
('loc_cov', [0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 9881]) ('snp_cov', [1, 10, 6, 6, 1, 14, 0, 10, 7, 6, 61, 43054]) ('snps', [43139, 43139, 43135, 43141, 43134, 43139, 43139, 43139, 43144, 43141, 43143, 43141]) ('nlocs', [9883, 9883, 9883, 9883, 9883, 9882, 9883, 9883, 9883, 9883, 9883, 9883])
## Maybe useful
print(v.ID)
print(len(v.ID))
print(len(set([x.split("_")[0] for x in v.ID])))
['1_6' '1_14' '1_27' ..., '614361_45' '614386_71' '614406_58'] 344633 76984
import itertools
import numpy.ma as ma
print(v.ID)
print(len(v.ID))
loc_list = []
loci = set([x.split("_")[0] for x in v.ID])
print(len(loci))
loc_list = [list(val) for key,val in itertools.groupby(v.ID,key=lambda x:x.split("_")[0])]
print(len(loc_list))
## experimentation
print(v.dtype)
print(c.dtype)
print(np.mean(c.DP))
print(loc_list[0:10])
subsamp = np.array([np.random.choice(x, 1, replace=False)[0] for x in loc_list])
mask = np.in1d(v.ID, subsamp)
['4_35' '4_62' '5_56' ..., '496594_58' '496602_66' '496604_59'] 149570 82955 82955 (numpy.record, [('CHROM', 'S12'), ('POS', '<i4'), ('ID', 'S12'), ('REF', 'S12'), ('ALT', 'S12'), ('QUAL', '<f4'), ('FILTER', [('PASS', '?')]), ('num_alleles', 'u1'), ('is_snp', '?'), ('svlen', '<i4'), ('AF', '<f2'), ('NS', '<i4')]) (numpy.record, [('is_called', '?'), ('is_phased', '?'), ('genotype', 'i1', (2,)), ('AD', '<u2', (2,)), ('DP', '<u2'), ('GL', '<f4'), ('GT', 'S3')]) 5.95418198837 [['4_35', '4_62'], ['5_56', '5_62', '5_68'], ['6_61', '6_68'], ['7_67'], ['8_5', '8_41', '8_65'], ['11_40'], ['13_49', '13_71'], ['15_6', '15_10', '15_24', '15_34'], ['16_26'], ['17_17', '17_26', '17_27', '17_66']]
g = allel.GenotypeArray(cmask.genotype)
gn = g.to_n_alt()
m = allel.stats.rogers_huff_r(gn[:1000]) ** 2
ax = allel.plot.pairwise_ld(m)
## Some crap from when i was counting loci a very stupid and slow way for stacks.
### This doesn't' count monomorphics, but could be useful still. It's dog slow.
#vcf = pd.read_csv("{}/batch_1.vcf".format(simdir), delim_whitespace=True, header=9, index_col=2)
#sample_counts = {}
#sample_names = list(vcf.columns)[8:]
#print(sample_names)
#uniq_loci = set([x.split("_")[0] for x in vcf.index])
#print("num loci - {}".format(len(uniq_loci)))
## Replace the ID index in the stacks vcf file so we can actually evaluate loci
#vcf.index = pd.Index([x.split("_")[0] for x in vcf.index])
#for name in sample_names:
# print(name),
# sample_counts[name] = 0
# for locus in uniq_loci:
# snps = vcf[name][locus]
# if any(map(lambda x: x.split(":")[0] != "./.", snps)):
# sample_counts[name] += 1
# else:
# pass
# #print("{} {} {}".format(name, locus, snps))
#print(sample_counts)
#sim_full_sample_nlocs["stacks_"+stacks_method+"-"+sim] = sample_coverage
filename = os.path.join(DDOCENT_OUTPUT, "Final.recode.snps.vcf")
v_filt = vcfnp.variants(filename).view(np.recarray)
c_filt = vcfnp.calldata_2d(filename).view(np.recarray)
[vcfnp] 2016-10-06 21:43:30.965523 :: caching is disabled [vcfnp] 2016-10-06 21:43:30.966488 :: building array [vcfnp] 2016-10-06 21:43:42.021230 :: caching is disabled [vcfnp] 2016-10-06 21:43:42.029759 :: building array
from collections import Counter
print(v_filt.dtype)
#wat = Counter(vcall["POS"])
#wat
## DDOCENT
v_filt["is_snp"]
print(v_filt[0:2])
print(v_filt["NS"][:40])
print(v_filt["NUMALT"][:40])
print(v_filt["ALT"][:40])
loci = set(v_filt["CHROM"])
(numpy.record, [('CHROM', 'S24'), ('POS', '<i4'), ('ID', 'S12'), ('REF', 'S12'), ('ALT', 'S12'), ('QUAL', '<f4'), ('FILTER', [('PASS', '?')]), ('num_alleles', 'u1'), ('is_snp', '?'), ('svlen', '<i4'), ('AB', '<f4'), ('ABP', '<f4'), ('AC', '<u2'), ('AF', '<f2'), ('AN', '<u2'), ('AO', '<i4'), ('CIGAR', 'S12'), ('DP', '<i4'), ('DPB', '<f4'), ('DPRA', '<f4'), ('END', '<i4'), ('EPP', '<f4'), ('EPPR', '<f4'), ('GTI', '<i4'), ('LEN', '<i4'), ('MEANALT', '<f4'), ('MIN_DP', '<i4'), ('MQM', '<f4'), ('MQMR', '<f4'), ('NS', '<i4'), ('NUMALT', '<i4'), ('ODDS', '<f4'), ('PAIRED', '<f4'), ('PAIREDR', '<f4'), ('PAO', '<f4'), ('PQA', '<f4'), ('PQR', '<f4'), ('PRO', '<f4'), ('QA', '<i4'), ('QR', '<i4'), ('RO', '<i4'), ('RPL', '<f4'), ('RPP', '<f4'), ('RPPR', '<f4'), ('RPR', '<f4'), ('RUN', '<i4'), ('SAF', '<i4'), ('SAP', '<f4'), ('SAR', '<i4'), ('SRF', '<i4'), ('SRP', '<f4'), ('SRR', '<i4'), ('TYPE', 'S12'), ('technology.Illumina', '<f4')]) [ ('dDocent_Contig_1', 7, '.', 'T', 'A', 9108.900390625, (False,), 3, True, 0, 0.5513780117034912, 12.15880012512207, 15, 0.5771484375, 26, 286, '1X', 466, 466.0, 0.0, 0, 624.051025390625, 317.8739929199219, 0, 1, 1.1538499593734741, 0, 55.307701110839844, 60.0, 13, 2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 11292, 5756, 145, 0.0, 624.051025390625, 317.8739929199219, 286.0, 1, 286, 624.051025390625, 0, 145, 317.8739929199219, 0, 'snp', 1.0) ('dDocent_Contig_1', 14, '.', 'CTT', 'CTA', 10709.2001953125, (False,), 3, False, 0, 0.6491230130195618, 80.07849884033203, 16, 0.615234375, 26, 325, '2M1X', 466, 466.0, 0.0, 0, 708.739013671875, 265.75799560546875, 0, 1, 1.2307699918746948, 0, 54.60309982299805, 60.0, 13, 2, 2.7755598776686065e-17, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12818, 4766, 121, 0.0, 708.739013671875, 265.75799560546875, 325.0, 1, 325, 708.739013671875, 0, 121, 265.75799560546875, 0, 'snp', 1.0)] [13 13 13 13 13 13 13 13 13 13 12 12 13 13 13 12 12 12 12 12 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13] [2 2 2 1 2 1 1 1 1 3 1 1 2 2 9 2 1 1 1 1 1 2 1 3 1 3 1 1 1 1 2 1 1 3 2 2 1 2 1 1] ['A' 'CTA' 'C' 'T' 'CGTG' 'A' 'C' 'T' 'A' 'AAAACAAC' 'GAC' 'T' 'A' 'AATAT' 'TGCGTTCAACGA' 'GT' 'ACGT' 'G' 'G' 'C' 'T' 'CACTAT' 'C' 'ATGAAG' 'C' 'AATTTATGTATA' 'C' 'T' 'T' 'G' 'CATTCTG' 'T' 'A' 'GATATATATG' 'AGAT' 'ATCTT' 'T' 'CGAGTTATT' 'T' 'A'] 19843
print(c.dtype)
print(len(c))
print(c["GT"][0:10][:,0])
print((c["GT"][0:10000][:,0] == "./.").sum())
print(c["GT"][0:5] != "./.")
print((x != "./.").sum() for x in c["GT"][0:5])
from collections import Counter
loci_coverage = [x.sum() for x in c_filt["GT"].T != "./."]
print(loci_coverage)
(numpy.record, [('is_called', '?'), ('is_phased', '?'), ('genotype', 'i1', (2,)), ('CATG', 'S12'), ('GT', 'S3')]) 220118 ['./.' './.' './.' './.' './.' './.' './.' './.' '0/0' '0/0'] 5811 [[False False False False False False False True False False False True False] [False False False False False False False True False False False True False] [False False False False True False False False True False False False False] [False False False False True False False False True False False False False] [False False False False True False False False True False False False False]] <generator object <genexpr> at 0x2aab77fca870> [96757, 98257, 97393, 95620, 97197, 95823, 98629, 98982, 99257, 99071, 99157, 99736, 99578]
#x = sim_loc_cov
#x = sim_snp_cov
#x = sim_sample_nsnps
x = sim_sample_nlocs
for sim in ["-simno", "-simlo", "-simhi", "-simlarge"]:
for prog in ["ipyrad", "pyrad", "stacks_ungapped", "stacks_gapped", "stacks_defualt", "ddocent_filt", "ddocent_full", "aftrrad"]:
try:
print(prog+sim+"\t"),
print(x[prog+sim]),
print(np.mean(x[prog+sim]))
except:
print("No {}".format(prog+sim))
print("------------------------------------------------------")
ipyrad-simno [9870, 9870, 9870, 9870, 9870, 9870, 9870, 9870, 9870, 9870, 9870, 9870] 9870.0 pyrad-simno [9894, 9894, 9894, 9894, 9894, 9894, 9894, 9894, 9894, 9894, 9894, 9894] 9894.0 stacks_ungapped-simno [9870, 9862, 9865, 9863, 9862, 9864, 9864, 9867, 9865, 9867, 9863, 9865] 9864.75 stacks_gapped-simno [9870, 9862, 9865, 9863, 9862, 9864, 9864, 9867, 9865, 9867, 9863, 9865] 9864.75 stacks_defualt-simno [9320, 9303, 9253, 9096, 9042, 9021, 8958, 8934, 8846, 8823, 8767, 8687] 9004.16666667 ddocent_filt-simno [9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852] 9852.0 ddocent_full-simno [9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852] 9852.0 aftrrad-simno No aftrrad-simno ------------------------------------------------------ ipyrad-simlo [9859, 9859, 9859, 9859, 9859, 9859, 9859, 9859, 9859, 9859, 9859, 9859] 9859.0 pyrad-simlo [9884, 9884, 9884, 9884, 9884, 9884, 9884, 9884, 9884, 9884, 9884, 9884] 9884.0 stacks_ungapped-simlo [9819, 9808, 9818, 9812, 9832, 9825, 9826, 9801, 9811, 9808, 9817, 9801] 9814.83333333 stacks_gapped-simlo [1512, 1503, 1507, 1503, 1507, 1504, 1508, 1508, 1507, 1506, 1508, 1506] 1506.58333333 stacks_defualt-simlo [9275, 9254, 9207, 9055, 9020, 8993, 8929, 8888, 8807, 8779, 8741, 8645] 8966.08333333 ddocent_filt-simlo [9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853] 9853.0 ddocent_full-simlo [9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853] 9853.0 aftrrad-simlo No aftrrad-simlo ------------------------------------------------------ ipyrad-simhi [9844, 9844, 9844, 9844, 9844, 9843, 9844, 9844, 9844, 9844, 9844, 9844] 9843.91666667 pyrad-simhi [9877, 9877, 9877, 9877, 9877, 9876, 9877, 9877, 9877, 9877, 9877, 9877] 9876.91666667 stacks_ungapped-simhi [9741, 9769, 9746, 9746, 9756, 9747, 9756, 9745, 9727, 9731, 9717, 9715] 9741.33333333 stacks_gapped-simhi [60, 60, 59, 60, 60, 60, 60, 60, 60, 60, 59, 58] 59.6666666667 stacks_defualt-simhi [9187, 9208, 9138, 8985, 8955, 8925, 8878, 8842, 8737, 8716, 8648, 8582] 8900.08333333 ddocent_filt-simhi [9848, 9848, 9848, 9848, 9848, 9848, 9846, 9847, 9847, 9848, 9848, 9848] 9847.66666667 ddocent_full-simhi [9848, 9848, 9848, 9850, 9850, 9850, 9848, 9848, 9849, 9850, 9850, 9850] 9849.08333333 aftrrad-simhi No aftrrad-simhi ------------------------------------------------------ ipyrad-simlarge [95399, 95428, 95405, 95456, 95450, 95412, 95437, 95418, 95283, 95294, 95288, 95302] 95381.0 pyrad-simlarge [95646, 95675, 95649, 95708, 95695, 95659, 95686, 95672, 95543, 95552, 95544, 95558] 95632.25 stacks_ungapped-simlarge [95366, 95401, 95391, 95400, 95394, 95367, 95411, 95405, 95232, 95276, 95268, 95271] 95348.5 stacks_gapped-simlarge [95366, 95401, 95391, 95400, 95394, 95367, 95411, 95405, 95232, 95276, 95268, 95271] 95348.5 stacks_defualt-simlarge [90013, 89989, 89284, 87596, 87548, 87334, 86818, 86236, 85427, 85225, 84641, 83680] 86982.5833333 ddocent_filt-simlarge [90677, 90706, 90329, 89908, 90701, 90653, 90336, 89898, 90677, 90684, 90271, 89800] 90386.6666667 ddocent_full-simlarge [95169, 95200, 95167, 95218, 95203, 95164, 95198, 95177, 95048, 95056, 95049, 95052] 95141.75 aftrrad-simlarge No aftrrad-simlarge ------------------------------------------------------
if True:
keys = ["e", "b", "b", "c", "d", "e", "e", 'a']
values = [1.2, 4.5, 4.3, 2.0, 5.67, 8.08, 9.01,1]
print('two methods of splitting by group')
print('as list')
for k,v in zip(*npi.group_by(keys)(values)):
print(k, v)
print('as iterable')
g = npi.group_by(keys)
for k,v in zip(g.unique, g.split_sequence_as_iterable(values)):
print(k, list(v))
print('iterable as iterable')
for k, v in zip(g.unique, g.split_iterable_as_iterable(values)):
print(k, list(v))
two methods of splitting by group as list ('a', array([ 1.])) ('b', array([ 4.5, 4.3])) ('c', array([ 2.])) ('d', array([ 5.67])) ('e', array([ 1.2 , 8.08, 9.01])) as iterable [1 2 1 1 3] ('a', [1]) ('b', [4.5, 4.3]) ('c', [2.0]) ('d', [5.67]) ('e', [1.2, 8.08, 9.01]) iterable as iterable ('a', [1]) ('b', [4.5, 4.3]) ('c', [2.0]) ('d', [5.67]) ('e', [1.2, 8.08, 9.01])