Notebook

dDocent simulated & emprical analyses¶

In [8]:

## Prereqs for plotting and analysis
#!conda install matplotlib
#!conda install libgcc  # for vcfnp
#!pip install vcfnp
#!pip install scikit-allel
#!conda install -y seaborn
#!pip install toyplot
#!pip install numpy_indexed

Collecting numpy-indexed
Collecting future (from numpy-indexed)
  Downloading future-0.16.0.tar.gz (824kB)
    100% |████████████████████████████████| 829kB 426kB/s 
Requirement already satisfied (use --upgrade to upgrade): pyyaml in /home/iovercast/opt/miniconda/lib/python2.7/site-packages (from numpy-indexed)
Building wheels for collected packages: future
  Running setup.py bdist_wheel for future ... - \ | / - \ | done
  Stored in directory: /home/iovercast/.cache/pip/wheels/c2/50/7c/0d83b4baac4f63ff7a765bd16390d2ab43c93587fac9d6017a
Successfully built future
Installing collected packages: future, numpy-indexed
Successfully installed future-0.16.0 numpy-indexed-0.3.4
You are using pip version 8.1.1, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.

In [1]:

## Imports and working/output directories directories

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = [12,9]

from collections import Counter
import seaborn as sns
sns.set_style('white')
sns.set_style('ticks')
import toyplot
import toyplot.html    ## toypot sublib for saving html plots
import pandas as pd
import numpy as np
import collections
import allel
import vcfnp
import shutil
import glob
import os
from allel.util import * ## for ensure_square()

## Set the default directories for exec and data. 
WORK_DIR="/home/iovercast/manuscript-analysis/"
EMPERICAL_DATA_DIR=os.path.join(WORK_DIR, "example_empirical_rad/")
IPYRAD_DIR=os.path.join(WORK_DIR, "ipyrad/")
PYRAD_DIR=os.path.join(WORK_DIR, "pyrad/")
STACKS_DIR=os.path.join(WORK_DIR, "stacks/")
AFTRRAD_DIR=os.path.join(WORK_DIR, "aftrRAD/")
DDOCENT_DIR=os.path.join(WORK_DIR, "dDocent/")

## tmp for testing
#IPYRAD_OUTPUT = "./arch/ipyrad/REALDATA/"
#PYRAD_OUTPUT = "./arch/pyrad/REALDATA/"

os.chdir(WORK_DIR)

Get sample names, species names, and the species_dict¶

Next will do some housekeeping to get our empirical samples sorted into species.

In [2]:

## Get sample names and assign them to a species dict
IPYRAD_STATS=os.path.join(IPYRAD_DIR, "REALDATA/REALDATA_outfiles/REALDATA_stats.txt")
infile = open(IPYRAD_STATS).readlines()
sample_names = [x.strip().split()[0] for x in infile[20:33]]
species = set([x.split("_")[1] for x in sample_names])
species_dict = collections.OrderedDict([])

## Ordered dict of sample names and their species, in same order
## as the vcf file
for s in sample_names:
    species_dict[s] = s.split("_")[1]
print(species_dict)

## Map species names to groups of individuals
#for s in species:
#    species_dict[s] = [x for x in sample_names if s in x]
#print(species_dict)
species_colors = {
    'rex': '#FF0000',
    'cyathophylla': '#008000',
    'thamno': '#00FFFF',
    'cyathophylloides': '#90EE90',
    'przewalskii': '#FFA500',
    'superba': '#8B0000',
}

OrderedDict([('29154_superba', 'superba'), ('30556_thamno', 'thamno'), ('30686_cyathophylla', 'cyathophylla'), ('32082_przewalskii', 'przewalskii'), ('33413_thamno', 'thamno'), ('33588_przewalskii', 'przewalskii'), ('35236_rex', 'rex'), ('35855_rex', 'rex'), ('38362_rex', 'rex'), ('39618_rex', 'rex'), ('40578_rex', 'rex'), ('41478_cyathophylloides', 'cyathophylloides'), ('41954_cyathophylloides', 'cyathophylloides')])

Function for plotting PCA given an input vcf file¶

In [4]:

def plotPCA(call_data, title):
    c = call_data
    g = allel.GenotypeArray(c.genotype)
    ac = g.count_alleles()
    ## Filter singletons and multi-allelic snps
    flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
    gf = g.compress(flt, axis=0)
    gn = gf.to_n_alt()
    coords1, model1 = allel.stats.pca(gn, n_components=10, scaler='patterson')
    fig = plt.figure(figsize=(5, 5))
    ax = fig.add_subplot(1, 1, 1)
    sns.despine(ax=ax, offset=5)
    x = coords1[:, 0]
    y = coords1[:, 1]
    
    ## We know this works because the species_dict and the columns in the vcf
    ## are in the same order. 
    for sp in species:
        flt = (np.array(species_dict.values()) == sp)
        ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=species_colors[sp], label=sp, markersize=10, mec='k', mew=.5)
    ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
    ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))
    ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    fig.suptitle(title+" pca", y=1.02, style="italic", fontsize=20, fontweight='bold')
    fig.tight_layout()

def getPCA(call_data):
    c = call_data
    g = allel.GenotypeArray(c.genotype)
    ac = g.count_alleles()
    ## Filter singletons and multi-allelic snps
    flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
    gf = g.compress(flt, axis=0)
    gn = gf.to_n_alt()
    coords1, model1 = allel.stats.pca(gn, n_components=10, scaler='patterson')
    return coords1, model1
"""
    fig = plt.figure(figsize=(5, 5))
    x = coords1[:, 0]
    y = coords1[:, 1]
    
    ## We know this works because the species_dict and the columns in the vcf
    ## are in the same order. 
#    for sp in species:
#        flt = (np.array(species_dict.values()) == sp)
#        ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=species_colors[sp], label=sp, markersize=10, mec='k', mew=.5)
    ax.plot(x, y, marker='o', markersize=10)
    ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
    ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))
"""

def plotPairwiseDistance(call_data, title):
    c = call_data
    #c = vcfnp.calldata_2d(filename).view(np.recarray)
    g = allel.GenotypeArray(c.genotype)
    gn = g.to_n_alt()
    dist = allel.stats.pairwise_distance(gn, metric='euclidean')
    allel.plot.pairwise_distance(dist, labels=species_dict.keys())

def getDistances(call_data):
    c = call_data
    #c = vcfnp.calldata_2d(filename).view(np.recarray)
    g = allel.GenotypeArray(c.genotype)
    gn = g.to_n_alt()
    dist = allel.stats.pairwise_distance(gn, metric='euclidean')
    return(dist)

Function for plotting distribution of variable sites across loci¶

In [5]:

## Inputs to this function are two Counters where keys
## are the base position and values are snp counts.
## The counter doesn't have to be sorted because we sort internally.
def SNP_position_plot(distvar, distpis):
    
    ## The last position to consider
    maxend = max(distvar.keys())
    
    ## This does two things, first it sorts in increasing
    ## order. Second, it creates a count bin for any position
    ## without snps and sets the count to 0.
    distvar = [distvar[x] for x in xrange(maxend)]
    distpis = [distpis[x] for x in xrange(maxend)]

    ## set color theme
    colormap = toyplot.color.Palette()

    ## make a canvas
    canvas = toyplot.Canvas(width=800, height=300)

    ## make axes
    axes = canvas.cartesian(xlabel="Position along RAD loci",
                       ylabel="N variables sites",
                       gutter=65)
    ## x-axis
    axes.x.ticks.show = True
    axes.x.label.style = {"baseline-shift":"-40px", "font-size":"16px"}
    axes.x.ticks.labels.style = {"baseline-shift":"-2.5px", "font-size":"12px"}
    axes.x.ticks.below = 5
    axes.x.ticks.above = 0
    axes.x.domain.max = maxend
    axes.x.ticks.locator = toyplot.locator.Explicit(
        range(0, maxend, 5), 
        map(str, range(0, maxend, 5)))
    
    ## y-axis
    axes.y.ticks.show=True
    axes.y.label.style = {"baseline-shift":"40px", "font-size":"16px"}
    axes.y.ticks.labels.style = {"baseline-shift":"5px", "font-size":"12px"}
    axes.y.ticks.below = 0
    axes.y.ticks.above = 5

    ## add fill plots
    x = np.arange(0, maxend)
    f1 = axes.fill(x, distvar, color=colormap[0], opacity=0.5, title="total variable sites")
    f2 = axes.fill(x, distpis, color=colormap[1], opacity=0.5, title="parsimony informative sites")

    ## add a horizontal dashed line at the median Nsnps per site
    axes.hlines(np.median(distvar), opacity=0.9, style={"stroke-dasharray":"4, 4"})
    axes.hlines(np.median(distpis), opacity=0.9, style={"stroke-dasharray":"4, 4"})
    
    return canvas, axes

Functions for polling stats from vcf call data¶

In [6]:

import numpy_indexed as npi
    
## Get the number of samples with data at each snp
def snp_coverage(call_data):
    snp_counts = collections.Counter([x.sum() for x in call_data["GT"] != "./."])
    ## Fill zero values
    return [snp_counts[x] for x in xrange(1, max(snp_counts.keys())+1)]

## Get the number of samples with data at each locus
def loci_coverage(var_data, call_data, assembler):
    if "stacks" in assembler:
        loci = zip(*npi.group_by(map(lambda x: x.split("_")[0],var_data["ID"]))(call_data["GT"] != "./."))
    else:
        loci = zip(*npi.group_by(var_data["CHROM"])(call_data["GT"] != "./."))
    counts_per_snp = []
    for z in xrange(0, len(loci)):
        counts_per_snp.append([x.sum() for x in loci[z][1]])
    counts = collections.Counter([np.max(x) for x in counts_per_snp])
    
    ## Fill all zero values
    return [counts[x] for x in xrange(1, max(counts.keys())+1)]

## Get total number of snps per sample
def sample_nsnps(call_data):
    return [x.sum() for x in call_data["GT"].T != "./."]

## Get total number of loci per sample
def sample_nloci(var_data, call_data, assembler):
    if "stacks" in assembler:
        locus_groups = npi.group_by(map(lambda x: x.split("_")[0],var_data["ID"]))(call_data["GT"] != "./.")
    else:
        locus_groups = npi.group_by(v["CHROM"])(c["GT"] != "./.")
        
    by_locus = [x.T for x in locus_groups[1]]
    by_sample = np.array([(x).any(axis=1) for x in by_locus])
    return [x.sum() for x in by_sample.T]

End housekeeping. Begin actual analysis of results.¶

First lets look at results just from the vcf files (so this is only looking at variable loci).¶

The first thing we'll do is create a dataframe for storing a bunch of coverage information from the runs for each method.

In [7]:

## Make a new pandas dataframe for holding the coverage results
## this is going to 
sim_vcf_dict = {}
for sim in ["simno", "simlo", "simhi", "simlarge"]:
    sim_vcf_dict["ipyrad-"+sim] = os.path.join(IPYRAD_DIR, "SIMDATA/{}/{}_outfiles/{}.vcf".format(sim, sim, sim))
    sim_vcf_dict["pyrad-"+sim] = os.path.join(PYRAD_DIR, "SIMDATA/{}/outfiles/c85d6m2p3H3N3.vcf".format(sim))
    sim_vcf_dict["stacks_ungapped-"+sim] = os.path.join(STACKS_DIR, "SIMDATA/ungapped/{}/batch_1.vcf".format(sim))
    sim_vcf_dict["stacks_gapped-"+sim] = os.path.join(STACKS_DIR, "SIMDATA/gapped/{}/batch_1.vcf".format(sim))
    sim_vcf_dict["stacks_defualt-"+sim] = os.path.join(STACKS_DIR, "SIMDATA/default/{}/batch_1.vcf".format(sim))
    sim_vcf_dict["aftrrad-"+sim] = os.path.join(AFTRRAD_DIR, "SIMDATA/{}/Formatting/{}.vcf".format(sim, sim))
    sim_vcf_dict["ddocent_full-"+sim] = os.path.join(DDOCENT_DIR, "SIMDATA/{}/{}_fastqs/TotalRawSNPs.vcf".format(sim, sim))
    sim_vcf_dict["ddocent_filt-"+sim] = os.path.join(DDOCENT_DIR, "SIMDATA/{}/{}_fastqs/Final.recode.vcf".format(sim, sim))

Clean up the vcf data¶

Each program writes out a vcf with some little quirks, so we have to smooth them all out. gunzip and filter out all non-biallelic loci. You only need to run this cell once, if you are reanalyzing the simdata.

In [37]:

for k, f in sim_vcf_dict.items():
    ## If it's gzipped then unzip it (only applies to ipyrad)
    if os.path.exists(f + ".gz"):
        print("gunzipping - {}".format(f+".gz"))
        cmd = "gunzip -c {}.gz > {}".format(f, f)
        !$cmd
        f = f.split(".gz")[0]
        sim_vcf_dict[k] = f

    if os.path.exists(f):
        print("found - {}".format(f))
        if "stacks" in k:
            ## The version of vcftools we have here doesn't like the newer version of vcf stacks
            ## writes, so we have to 'fix' the stacks vcf version
            shutil.copy2(f, f+".bak")
            with open(f) as infile:
                dat = infile.readlines()[1:]
            with open(f, 'w') as outfile:
                outfile.write("##fileformat=VCFv4.1\n")
                outfile.write("".join(dat))

## Actually don't do this, it removes information about the assembly. 
## I don't know why i wanted to do this in the first place. Habit?
##        try:
##            ## Remove all but biallelic. 
##            outfile = f.split(".vcf")[0]+"-biallelic"
##            cmd = "{}vcftools --vcf {} --min-alleles 2 --max-alleles 2 --recode --out {}" \
##                .format(DDOCENT_DIR, f, outfile)
##            #print(cmd)
##            os.system(cmd)
##            
##            ## update the vcf_dict
##            sim_vcf_dict[k] = outfile + ".recode.vcf"
##        except Exception as inst:
##            print(inst)
    else:
        print("not found - {}".format(f))

found - /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simlarge/simlarge_outfiles/simlarge-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simhi/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simlo/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlarge/simlarge_fastqs/TotalRawSNPs-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlo/simlo_fastqs/Final.recode-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simno/batch_1-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simlarge/batch_1-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simno/Formatting/simno-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simlo/Formatting/simlo-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simno/simno_fastqs/Final.recode-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simlo/batch_1-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simlo/simlo_outfiles/simlo-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simhi/batch_1-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlarge/simlarge_fastqs/Final.recode-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simhi/batch_1-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simno/simno_outfiles/simno-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simlarge/Formatting/simlarge-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simhi/batch_1-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simhi/Formatting/simhi-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simno/batch_1-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simlarge/batch_1-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simlo/batch_1-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simhi/simhi_outfiles/simhi-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simno/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/TotalRawSNPs-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simno/batch_1-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simlarge/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlo/simlo_fastqs/TotalRawSNPs-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simlarge/batch_1-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simlo/batch_1-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/Final.recode-biallelic.recode.vcf
found - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simno/simno_fastqs/TotalRawSNPs-biallelic.recode.vcf

Pull depth and coverage stats out of the vcf files¶

Get number of loci per sample, and locus coverage for each vcf file from each assembler and each simulated dataset. This is reading in from the vcf files from each assembly method. This is nice because vcf is relatively standard and all the tools can give us a version of vcf. It's not perfect though because it doesn't include information about monomorphic sites, so it doesn't tell us the true number of loci recovered. We can get an idea of coverage and depth at snps, but to get coverage and depth stats across all loci we need to dig into the guts of the output of each method (which we'll do later).

In [38]:

import collections
sim_loc_cov = collections.OrderedDict()
sim_snp_cov = collections.OrderedDict()
sim_sample_nsnps = collections.OrderedDict()
sim_sample_nlocs = collections.OrderedDict()
## Try just doing them all the same
for prog, filename in sim_vcf_dict.items():
    try:
        print("Doing - {}".format(prog))
        print("\t{}".format(filename))
        v = vcfnp.variants(filename, verbose=False, dtypes={"CHROM":"a24"}).view(np.recarray)
        c = vcfnp.calldata_2d(filename, verbose=False).view(np.recarray)

        sim_snp_cov[prog] = snp_coverage(c)
        sim_sample_nsnps[prog] = sample_nsnps(c)
        ## aftrRAD doesn't retain the kind of info we'd need for the next two
        if "aftrrad" in prog:
            continue
        sim_loc_cov[prog] = loci_coverage(v, c, prog)
        sim_sample_nlocs[prog] = sample_nloci(v, c, prog)
    except Exception as inst:
        print(inst)

[vcfnp] 2016-11-27 16:53:46.063946 :: caching is disabled
[vcfnp] 2016-11-27 16:53:46.064616 :: building array

Doing - ipyrad-simlarge
	/home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simlarge/simlarge_outfiles/simlarge-biallelic.recode.vcf

[vcfnp] 2016-11-27 16:53:50.046048 :: caching is disabled
[vcfnp] 2016-11-27 16:53:50.046543 :: building array
[vcfnp] 2016-11-27 16:54:20.066691 :: caching is disabled
[vcfnp] 2016-11-27 16:54:20.067386 :: building array

Doing - pyrad-simhi
	/home/iovercast/manuscript-analysis/pyrad/SIMDATA/simhi/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf

[vcfnp] 2016-11-27 16:54:20.493348 :: caching is disabled
[vcfnp] 2016-11-27 16:54:20.493821 :: building array
[vcfnp] 2016-11-27 16:54:23.319907 :: caching is disabled
[vcfnp] 2016-11-27 16:54:23.320572 :: building array

Doing - pyrad-simlo
	/home/iovercast/manuscript-analysis/pyrad/SIMDATA/simlo/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf

[vcfnp] 2016-11-27 16:54:23.765458 :: caching is disabled
[vcfnp] 2016-11-27 16:54:23.765922 :: building array
[vcfnp] 2016-11-27 16:54:26.717321 :: caching is disabled
[vcfnp] 2016-11-27 16:54:26.717919 :: building array

Doing - ddocent_full-simlarge
	/home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlarge/simlarge_fastqs/TotalRawSNPs-biallelic.recode.vcf

[vcfnp] 2016-11-27 16:54:38.023720 :: caching is disabled
[vcfnp] 2016-11-27 16:54:38.024183 :: building array
[vcfnp] 2016-11-27 16:55:53.857176 :: caching is disabled
[vcfnp] 2016-11-27 16:55:53.857903 :: building array

Doing - ddocent_filt-simlo
	/home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlo/simlo_fastqs/Final.recode-biallelic.recode.vcf

[vcfnp] 2016-11-27 16:55:55.048663 :: caching is disabled
[vcfnp] 2016-11-27 16:55:55.049266 :: building array
[vcfnp] 2016-11-27 16:56:02.929772 :: caching is disabled
[vcfnp] 2016-11-27 16:56:02.930457 :: building array

Doing - stacks_gapped-simno
	/home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simno/batch_1-biallelic.recode.vcf

[vcfnp] 2016-11-27 16:56:03.477877 :: caching is disabled
[vcfnp] 2016-11-27 16:56:03.478366 :: building array
[vcfnp] 2016-11-27 16:56:08.162218 :: caching is disabled
[vcfnp] 2016-11-27 16:56:08.162971 :: building array

Doing - stacks_ungapped-simlarge
	/home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simlarge/batch_1-biallelic.recode.vcf

[vcfnp] 2016-11-27 16:56:13.256345 :: caching is disabled
[vcfnp] 2016-11-27 16:56:13.256817 :: building array
[vcfnp] 2016-11-27 16:57:00.367378 :: caching is disabled
[vcfnp] 2016-11-27 16:57:00.368017 :: building array

Doing - aftrrad-simno
	/home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simno/Formatting/simno-biallelic.recode.vcf

[vcfnp] 2016-11-27 16:57:00.673033 :: caching is disabled
[vcfnp] 2016-11-27 16:57:00.673508 :: building array
[vcfnp] 2016-11-27 16:57:02.487800 :: caching is disabled
[vcfnp] 2016-11-27 16:57:02.488378 :: building array

Doing - aftrrad-simlo
	/home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simlo/Formatting/simlo-biallelic.recode.vcf

[vcfnp] 2016-11-27 16:57:02.816922 :: caching is disabled
[vcfnp] 2016-11-27 16:57:02.817355 :: building array
[vcfnp] 2016-11-27 16:57:04.730942 :: caching is disabled
[vcfnp] 2016-11-27 16:57:04.731550 :: building array

Doing - ddocent_filt-simno
	/home/iovercast/manuscript-analysis/dDocent/SIMDATA/simno/simno_fastqs/Final.recode-biallelic.recode.vcf

[vcfnp] 2016-11-27 16:57:06.023200 :: caching is disabled
[vcfnp] 2016-11-27 16:57:06.023701 :: building array
[vcfnp] 2016-11-27 16:57:14.012106 :: caching is disabled
[vcfnp] 2016-11-27 16:57:14.012827 :: building array
[vcfnp] 2016-11-27 16:57:14.103613 :: caching is disabled
[vcfnp] 2016-11-27 16:57:14.104090 :: building array

Doing - stacks_gapped-simlo
	/home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simlo/batch_1-biallelic.recode.vcf

[vcfnp] 2016-11-27 16:57:14.906996 :: caching is disabled
[vcfnp] 2016-11-27 16:57:14.907457 :: building array

Doing - ipyrad-simlo
	/home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simlo/simlo_outfiles/simlo-biallelic.recode.vcf

[vcfnp] 2016-11-27 16:57:15.318753 :: caching is disabled
[vcfnp] 2016-11-27 16:57:15.319228 :: building array
[vcfnp] 2016-11-27 16:57:18.347953 :: caching is disabled
[vcfnp] 2016-11-27 16:57:18.348643 :: building array

Doing - stacks_defualt-simhi
	/home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simhi/batch_1-biallelic.recode.vcf

[vcfnp] 2016-11-27 16:57:18.757473 :: caching is disabled
[vcfnp] 2016-11-27 16:57:18.757896 :: building array
[vcfnp] 2016-11-27 16:57:22.446983 :: caching is disabled
[vcfnp] 2016-11-27 16:57:22.447555 :: building array

Doing - ddocent_filt-simlarge
	/home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlarge/simlarge_fastqs/Final.recode-biallelic.recode.vcf

[vcfnp] 2016-11-27 16:57:33.454647 :: caching is disabled
[vcfnp] 2016-11-27 16:57:33.455088 :: building array
[vcfnp] 2016-11-27 16:58:47.221461 :: caching is disabled
[vcfnp] 2016-11-27 16:58:47.222295 :: building array
[vcfnp] 2016-11-27 16:58:47.238296 :: caching is disabled
[vcfnp] 2016-11-27 16:58:47.238777 :: building array
[vcfnp] 2016-11-27 16:58:47.295944 :: caching is disabled
[vcfnp] 2016-11-27 16:58:47.296469 :: building array

Doing - stacks_gapped-simhi
	/home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simhi/batch_1-biallelic.recode.vcf
Doing - ipyrad-simno
	/home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simno/simno_outfiles/simno-biallelic.recode.vcf

[vcfnp] 2016-11-27 16:58:47.749539 :: caching is disabled
[vcfnp] 2016-11-27 16:58:47.750019 :: building array
[vcfnp] 2016-11-27 16:58:50.666550 :: caching is disabled
[vcfnp] 2016-11-27 16:58:50.667183 :: building array

Doing - aftrrad-simlarge
	/home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simlarge/Formatting/simlarge-biallelic.recode.vcf

[vcfnp] 2016-11-27 16:58:52.924963 :: caching is disabled
[vcfnp] 2016-11-27 16:58:52.925433 :: building array
[vcfnp] 2016-11-27 16:59:07.577632 :: caching is disabled
[vcfnp] 2016-11-27 16:59:07.578327 :: building array

Doing - stacks_ungapped-simhi
	/home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simhi/batch_1-biallelic.recode.vcf

[vcfnp] 2016-11-27 16:59:08.104825 :: caching is disabled
[vcfnp] 2016-11-27 16:59:08.105282 :: building array
[vcfnp] 2016-11-27 16:59:12.857512 :: caching is disabled
[vcfnp] 2016-11-27 16:59:12.858353 :: building array

Doing - aftrrad-simhi
	/home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simhi/Formatting/simhi-biallelic.recode.vcf

[vcfnp] 2016-11-27 16:59:13.149329 :: caching is disabled
[vcfnp] 2016-11-27 16:59:13.149779 :: building array
[vcfnp] 2016-11-27 16:59:14.874231 :: caching is disabled
[vcfnp] 2016-11-27 16:59:14.875006 :: building array

Doing - stacks_ungapped-simno
	/home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simno/batch_1-biallelic.recode.vcf

[vcfnp] 2016-11-27 16:59:15.416360 :: caching is disabled
[vcfnp] 2016-11-27 16:59:15.416977 :: building array
[vcfnp] 2016-11-27 16:59:20.232799 :: caching is disabled
[vcfnp] 2016-11-27 16:59:20.233563 :: building array

Doing - stacks_defualt-simlarge
	/home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simlarge/batch_1-biallelic.recode.vcf

[vcfnp] 2016-11-27 16:59:24.182469 :: caching is disabled
[vcfnp] 2016-11-27 16:59:24.183060 :: building array
[vcfnp] 2016-11-27 17:00:01.487897 :: caching is disabled
[vcfnp] 2016-11-27 17:00:01.488714 :: building array

Doing - stacks_ungapped-simlo
	/home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simlo/batch_1-biallelic.recode.vcf

[vcfnp] 2016-11-27 17:00:02.032021 :: caching is disabled
[vcfnp] 2016-11-27 17:00:02.032495 :: building array
[vcfnp] 2016-11-27 17:00:07.044474 :: caching is disabled
[vcfnp] 2016-11-27 17:00:07.045104 :: building array

Doing - ipyrad-simhi
	/home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simhi/simhi_outfiles/simhi-biallelic.recode.vcf

[vcfnp] 2016-11-27 17:00:07.802941 :: caching is disabled
[vcfnp] 2016-11-27 17:00:07.803469 :: building array
[vcfnp] 2016-11-27 17:00:10.822673 :: caching is disabled
[vcfnp] 2016-11-27 17:00:10.823271 :: building array

Doing - pyrad-simno
	/home/iovercast/manuscript-analysis/pyrad/SIMDATA/simno/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf

[vcfnp] 2016-11-27 17:00:11.291555 :: caching is disabled
[vcfnp] 2016-11-27 17:00:11.292032 :: building array
[vcfnp] 2016-11-27 17:00:14.220085 :: caching is disabled
[vcfnp] 2016-11-27 17:00:14.220821 :: building array

Doing - ddocent_full-simhi
	/home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/TotalRawSNPs-biallelic.recode.vcf

[vcfnp] 2016-11-27 17:00:15.503526 :: caching is disabled
[vcfnp] 2016-11-27 17:00:15.503993 :: building array
[vcfnp] 2016-11-27 17:00:23.207787 :: caching is disabled
[vcfnp] 2016-11-27 17:00:23.208546 :: building array

Doing - stacks_defualt-simno
	/home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simno/batch_1-biallelic.recode.vcf

[vcfnp] 2016-11-27 17:00:23.640669 :: caching is disabled
[vcfnp] 2016-11-27 17:00:23.641165 :: building array
[vcfnp] 2016-11-27 17:00:27.520616 :: caching is disabled
[vcfnp] 2016-11-27 17:00:27.521816 :: building array

Doing - pyrad-simlarge
	/home/iovercast/manuscript-analysis/pyrad/SIMDATA/simlarge/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf

[vcfnp] 2016-11-27 17:00:31.749149 :: caching is disabled
[vcfnp] 2016-11-27 17:00:31.749765 :: building array
[vcfnp] 2016-11-27 17:00:59.712859 :: caching is disabled
[vcfnp] 2016-11-27 17:00:59.713690 :: building array

Doing - ddocent_full-simlo
	/home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlo/simlo_fastqs/TotalRawSNPs-biallelic.recode.vcf

[vcfnp] 2016-11-27 17:01:00.898081 :: caching is disabled
[vcfnp] 2016-11-27 17:01:00.898895 :: building array
[vcfnp] 2016-11-27 17:01:08.917655 :: caching is disabled
[vcfnp] 2016-11-27 17:01:08.918244 :: building array

Doing - stacks_gapped-simlarge
	/home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simlarge/batch_1-biallelic.recode.vcf

[vcfnp] 2016-11-27 17:01:14.105084 :: caching is disabled
[vcfnp] 2016-11-27 17:01:14.105781 :: building array
[vcfnp] 2016-11-27 17:01:59.933549 :: caching is disabled
[vcfnp] 2016-11-27 17:01:59.934387 :: building array

Doing - stacks_defualt-simlo
	/home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simlo/batch_1-biallelic.recode.vcf

[vcfnp] 2016-11-27 17:02:00.344161 :: caching is disabled
[vcfnp] 2016-11-27 17:02:00.344609 :: building array
[vcfnp] 2016-11-27 17:02:04.000344 :: caching is disabled
[vcfnp] 2016-11-27 17:02:04.000860 :: building array

Doing - ddocent_filt-simhi
	/home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/Final.recode-biallelic.recode.vcf

[vcfnp] 2016-11-27 17:02:05.309855 :: caching is disabled
[vcfnp] 2016-11-27 17:02:05.310451 :: building array
[vcfnp] 2016-11-27 17:02:13.334594 :: caching is disabled
[vcfnp] 2016-11-27 17:02:13.335213 :: building array

Doing - ddocent_full-simno
	/home/iovercast/manuscript-analysis/dDocent/SIMDATA/simno/simno_fastqs/TotalRawSNPs-biallelic.recode.vcf

[vcfnp] 2016-11-27 17:02:14.594919 :: caching is disabled
[vcfnp] 2016-11-27 17:02:14.595400 :: building array

Write out the results to a file¶

In case we need to restart the notebook then we won't have to come back and reload all the data

In [496]:

res_dict = {"sim_loc_cov":sim_loc_cov, "sim_snp_cov":sim_snp_cov,\
            "sim_sample_nsnps":sim_sample_nsnps, "sim_sample_nlocs":sim_sample_nlocs}

res = WORK_DIR + "RESULTS/"
if not os.path.exists(res):
    os.mkdir(res)

for k,v in res_dict.items():
    df = pd.DataFrame(v)
    df.to_csv(res + k + ".csv")

Reload the data from csv files¶

If you ever need to do this.

In [497]:

for k,v in res_dict.items():
    df = pd.read_csv(res + k + ".csv")
    print(df)

    Unnamed: 0  ipyrad-simlarge  pyrad-simhi  pyrad-simlo  \
0            0                0            0            0   
1            1               12            0            0   
2            2               35            0            0   
3            3              436            0            0   
4            4              137            0            0   
5            5               70            2            0   
6            6              256            0            0   
7            7             1737            0            0   
8            8             1688            0            0   
9            9             2327            0            0   
10          10             9920            1            0   
11          11            81527         9875         9884   

    ddocent_full-simlarge  ddocent_filt-simlo  stacks_gapped-simno  \
0                       1                   0                    0   
1                       2                   0                    0   
2                      38                   0                    0   
3                     433                   0                    0   
4                     139                   0                    0   
5                      70                   0                    0   
6                     258                   0                    0   
7                    1732                   0                    0   
8                    1689                   0                    0   
9                    2322                   0                    5   
10                   9808                   0                  329   
11                  81397                9853                 9559   

    stacks_ungapped-simlarge  ddocent_filt-simno  stacks_gapped-simlo  \
0                          5                   0                    0   
1                         14                   0                    0   
2                         42                   0                    0   
3                        440                   0                    0   
4                        140                   0                    0   
5                         82                   0                    0   
6                        296                   0                    0   
7                       1743                   0                    1   
8                       1729                   0                    0   
9                       2627                   0                    2   
10                     12400                   0                   57   
11                     78902                9852                 1452   

           ...          ipyrad-simhi  pyrad-simno  ddocent_full-simhi  \
0          ...                     0            0                   0   
1          ...                     0            0                   0   
2          ...                     0            0                   0   
3          ...                     1            0                   2   
4          ...                     0            0                   0   
5          ...                     0            0                   0   
6          ...                     0            0                   0   
7          ...                     1            0                   0   
8          ...                     0            0                   1   
9          ...                     0            0                   0   
10         ...                     1            0                   4   
11         ...                  9842         9894                9844   

    stacks_defualt-simno  pyrad-simlarge  ddocent_full-simlo  \
0                    350               0                   0   
1                    259              13                   0   
2                    365              38                   0   
3                    528             446                   0   
4                     68             141                   0   
5                     60              72                   0   
6                    153             258                   0   
7                    668            1748                   0   
8                    315            1704                   0   
9                    338            2342                   0   
10                  1311            9952                   0   
11                  6352           81709                9853   

    stacks_gapped-simlarge  stacks_defualt-simlo  ddocent_filt-simhi  \
0                        5                   356                   0   
1                       14                   278                   0   
2                       42                   386                   0   
3                      440                   532                   0   
4                      140                    74                   0   
5                       82                    64                   0   
6                      296                   178                   0   
7                     1743                   690                   0   
8                     1729                   332                   0   
9                     2627                   371                   0   
10                   12400                  1447                   4   
11                   78902                  6105                9844   

    ddocent_full-simno  
0                    0  
1                    0  
2                    0  
3                    0  
4                    0  
5                    0  
6                    0  
7                    0  
8                    0  
9                    0  
10                   0  
11                9852  

[12 rows x 29 columns]
    Unnamed: 0  ipyrad-simlarge  pyrad-simhi  pyrad-simlo  \
0            0            95399         9877         9884   
1            1            95428         9877         9884   
2            2            95405         9877         9884   
3            3            95456         9877         9884   
4            4            95450         9877         9884   
5            5            95412         9876         9884   
6            6            95437         9877         9884   
7            7            95418         9877         9884   
8            8            95283         9877         9884   
9            9            95294         9877         9884   
10          10            95288         9877         9884   
11          11            95302         9877         9884   

    ddocent_full-simlarge  ddocent_filt-simlo  stacks_gapped-simno  \
0                   95169                9853                 9870   
1                   95200                9853                 9862   
2                   95167                9853                 9865   
3                   95218                9853                 9863   
4                   95203                9853                 9862   
5                   95164                9853                 9864   
6                   95198                9853                 9864   
7                   95177                9853                 9867   
8                   95048                9853                 9865   
9                   95056                9853                 9867   
10                  95049                9853                 9863   
11                  95052                9853                 9865   

    stacks_ungapped-simlarge  ddocent_filt-simno  stacks_gapped-simlo  \
0                      95366                9852                 1512   
1                      95401                9852                 1503   
2                      95391                9852                 1507   
3                      95400                9852                 1503   
4                      95394                9852                 1507   
5                      95367                9852                 1504   
6                      95411                9852                 1508   
7                      95405                9852                 1508   
8                      95232                9852                 1507   
9                      95276                9852                 1506   
10                     95268                9852                 1508   
11                     95271                9852                 1506   

           ...          ipyrad-simhi  pyrad-simno  ddocent_full-simhi  \
0          ...                  9844         9894                9848   
1          ...                  9844         9894                9848   
2          ...                  9844         9894                9848   
3          ...                  9844         9894                9850   
4          ...                  9844         9894                9850   
5          ...                  9843         9894                9850   
6          ...                  9844         9894                9848   
7          ...                  9844         9894                9848   
8          ...                  9844         9894                9849   
9          ...                  9844         9894                9850   
10         ...                  9844         9894                9850   
11         ...                  9844         9894                9850   

    stacks_defualt-simno  pyrad-simlarge  ddocent_full-simlo  \
0                   9320           95646                9853   
1                   9303           95675                9853   
2                   9253           95649                9853   
3                   9096           95708                9853   
4                   9042           95695                9853   
5                   9021           95659                9853   
6                   8958           95686                9853   
7                   8934           95672                9853   
8                   8846           95543                9853   
9                   8823           95552                9853   
10                  8767           95544                9853   
11                  8687           95558                9853   

    stacks_gapped-simlarge  stacks_defualt-simlo  ddocent_filt-simhi  \
0                    95366                  9275                9848   
1                    95401                  9254                9848   
2                    95391                  9207                9848   
3                    95400                  9055                9848   
4                    95394                  9020                9848   
5                    95367                  8993                9848   
6                    95411                  8929                9846   
7                    95405                  8888                9847   
8                    95232                  8807                9847   
9                    95276                  8779                9848   
10                   95268                  8741                9848   
11                   95271                  8645                9848   

    ddocent_full-simno  
0                 9852  
1                 9852  
2                 9852  
3                 9852  
4                 9852  
5                 9852  
6                 9852  
7                 9852  
8                 9852  
9                 9852  
10                9852  
11                9852  

[12 rows x 29 columns]
    Unnamed: 0  ipyrad-simlarge  pyrad-simhi  pyrad-simlo  \
0            0           399293        42421        43106   
1            1           399339        42421        43105   
2            2           399224        42417        43105   
3            3           399319        42423        43099   
4            4           399355        42416        43100   
5            5           399039        42421        43101   
6            6           399259        42421        43104   
7            7           399070        42421        43101   
8            8           398377        42427        43103   
9            9           398531        42423        43106   
10          10           398376        42425        43106   
11          11           398234        42423        43103   

    ddocent_full-simlarge  ddocent_filt-simlo  stacks_gapped-simno  \
0                  381136               39879                43704   
1                  381212               39877                43636   
2                  381074               39878                43669   
3                  381093               39875                43674   
4                  381176               39879                43644   
5                  380907               39876                43677   
6                  381106               39879                43668   
7                  380897               39876                43692   
8                  380142               39879                43668   
9                  380310               39877                43694   
10                 380280               39879                43676   
11                 380121               39875                43692   

    stacks_ungapped-simlarge  aftrrad-simno  aftrrad-simlo  \
0                     416856          40658          40007   
1                     416936          40658          40007   
2                     416890          40658          40007   
3                     416846          40658          40007   
4                     416861          40658          40007   
5                     416524          40658          40007   
6                     416844          40658          40007   
7                     416795          40658          40007   
8                     415751          40658          40007   
9                     416064          40658          40007   
10                    415945          40658          40007   
11                    415788          40658          40007   

           ...          ipyrad-simhi  pyrad-simno  ddocent_full-simhi  \
0          ...                 40402        43855               40041   
1          ...                 40402        43854               40045   
2          ...                 40402        43855               40041   
3          ...                 40402        43855               40049   
4          ...                 40402        43855               40054   
5          ...                 40399        43855               40057   
6          ...                 40402        43855               40043   
7          ...                 40402        43855               40041   
8          ...                 40400        43854               40053   
9          ...                 40400        43855               40060   
10         ...                 40400        43854               40054   
11         ...                 40400        43855               40054   

    stacks_defualt-simno  pyrad-simlarge  ddocent_full-simlo  \
0                  32242          418730               39889   
1                  32199          418786               39887   
2                  32099          418656               39888   
3                  31740          418768               39887   
4                  31694          418782               39890   
5                  31644          418459               39889   
6                  31502          418659               39888   
7                  31443          418481               39887   
8                  30932          417726               39898   
9                  30922          417896               39893   
10                 30868          417747               39893   
11                 30683          417623               39888   

    stacks_gapped-simlarge  stacks_defualt-simlo  ddocent_filt-simhi  \
0                   416856                 31446               40008   
1                   416936                 31372               40010   
2                   416890                 31298               40003   
3                   416846                 30969               40003   
4                   416861                 30944               40008   
5                   416524                 30881               40008   
6                   416844                 30741               40002   
7                   416795                 30662               40001   
8                   415751                 30184               40005   
9                   416064                 30149               40010   
10                  415945                 30129               40004   
11                  415788                 29904               40002   

    ddocent_full-simno  
0                39981  
1                39980  
2                39980  
3                39980  
4                39980  
5                39979  
6                39980  
7                39977  
8                39981  
9                39979  
10               39980  
11               39980  

[12 rows x 33 columns]
    Unnamed: 0  ipyrad-simlarge  pyrad-simhi  pyrad-simlo  \
0            0                0            1            1   
1            1               22           10            0   
2            2               46            6            1   
3            3              786            6            1   
4            4              305            1            1   
5            5              175           14            0   
6            6              721            0            0   
7            7             5301           10            2   
8            8             5886            7            1   
9            9             8687            6            2   
10          10            39920           60           19   
11          11           346258        42337        43081   

    ddocent_full-simlarge  ddocent_filt-simlo  stacks_gapped-simno  \
0                       3                   0                    0   
1                       3                   0                    0   
2                      53                   0                    0   
3                     758                   0                    0   
4                     299                   0                    0   
5                     164                   0                    0   
6                     696                   0                    0   
7                    5106                   0                    0   
8                    5745                   0                    0   
9                    8397                   0                   24   
10                  37784                  19                 1362   
11                 330563               39860                42406   

    stacks_ungapped-simlarge  aftrrad-simno  aftrrad-simlo  \
0                          6              0              0   
1                         25              0              0   
2                         56              0              0   
3                        819              0              0   
4                        317              0              0   
5                        201              0              0   
6                        880              0              0   
7                       5515              0              0   
8                       6288              0              0   
9                      10290              0              0   
10                     52091              0              0   
11                    350753          40658          40007   

           ...          ipyrad-simhi  pyrad-simno  ddocent_full-simhi  \
0          ...                     0            0                  25   
1          ...                     0            0                   3   
2          ...                     0            0                   7   
3          ...                     3            0                  11   
4          ...                     0            0                   2   
5          ...                     0            0                   2   
6          ...                     0            0                   0   
7          ...                     5            0                   8   
8          ...                     0            0                  20   
9          ...                     0            0                   7   
10         ...                     3            3                  80   
11         ...                 40394        43852               39940   

    stacks_defualt-simno  pyrad-simlarge  ddocent_full-simlo  \
0                    469               0                  11   
1                    327              24                   0   
2                    520              50                   1   
3                    840             827                   2   
4                    128             319                   2   
5                    134             180                   0   
6                    348             761                   0   
7                   1715            5535                   2   
8                    920            6183                   2   
9                   1063            9158                   1   
10                  4497           41802                  19   
11                 23829          363123               39866   

    stacks_gapped-simlarge  stacks_defualt-simlo  ddocent_filt-simhi  \
0                        6                   472                   0   
1                       25                   347                   0   
2                       56                   539                   0   
3                      819                   833                   0   
4                      317                   143                   0   
5                      201                   134                   0   
6                      880                   397                   0   
7                     5515                  1754                   0   
8                     6288                   957                   0   
9                    10290                  1155                   0   
10                   52091                  4861                  80   
11                  350753                 22550               39932   

    ddocent_full-simno  
0                    0  
1                    0  
2                    0  
3                    0  
4                    0  
5                    0  
6                    0  
7                    1  
8                    0  
9                    1  
10                   9  
11               39970  

[12 rows x 33 columns]

Print out the results¶

This isn't very helpful, but you get an idea of what's going on. Values here for stacks gapped analysis look weird because they are bad. The populations program was segfaulting during creation of the vcf, so it's super truncated. Not sure what the problem is. The results down below the plots pull in stats from other files so they are better.

In [39]:

for statname, stat in {"sim_loc_cov":sim_loc_cov, "sim_snp_cov":sim_snp_cov,\
             "sim_sample_nsnps":sim_sample_nsnps, "sim_sample_nlocs":sim_sample_nlocs}.items():

    for sim in ["-simno", "-simlo", "-simhi", "-simlarge"]:
        for prog in ["ipyrad", "pyrad", "stacks_ungapped", "stacks_gapped", "stacks_defualt", "ddocent_filt", "ddocent_full", "aftrrad"]:
            try:
                print(prog+sim + " " + statname + "\t"),
                print(stat[prog + sim]),
                print(np.mean(stat[prog + sim]))
            except:
                print("No {} stats for {}".format(statname, prog + sim))
        print("------------------------------------------------------")

ipyrad-simno sim_loc_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9870] 822.5
pyrad-simno sim_loc_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9894] 824.5
stacks_ungapped-simno sim_loc_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 329, 9559] 824.416666667
stacks_gapped-simno sim_loc_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 329, 9559] 824.416666667
stacks_defualt-simno sim_loc_cov	[350, 259, 365, 528, 68, 60, 153, 668, 315, 338, 1311, 6352] 897.25
ddocent_filt-simno sim_loc_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9852] 821.0
ddocent_full-simno sim_loc_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9852] 821.0
aftrrad-simno sim_loc_cov	No sim_loc_cov stats for aftrrad-simno
------------------------------------------------------
ipyrad-simlo sim_loc_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9859] 821.583333333
pyrad-simlo sim_loc_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9884] 823.666666667
stacks_ungapped-simlo sim_loc_cov	[24, 30, 31, 38, 6, 2, 8, 48, 48, 66, 573, 9131] 833.75
stacks_gapped-simlo sim_loc_cov	[0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 57, 1452] 126.0
stacks_defualt-simlo sim_loc_cov	[356, 278, 386, 532, 74, 64, 178, 690, 332, 371, 1447, 6105] 901.083333333
ddocent_filt-simlo sim_loc_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9853] 821.083333333
ddocent_full-simlo sim_loc_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9853] 821.083333333
aftrrad-simlo sim_loc_cov	No sim_loc_cov stats for aftrrad-simlo
------------------------------------------------------
ipyrad-simhi sim_loc_cov	[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 9842] 820.416666667
pyrad-simhi sim_loc_cov	[0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 9875] 823.166666667
stacks_ungapped-simhi sim_loc_cov	[54, 59, 66, 96, 17, 12, 18, 138, 106, 132, 943, 8509] 845.833333333
stacks_gapped-simhi sim_loc_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 56] 5.0
stacks_defualt-simhi sim_loc_cov	[369, 288, 402, 556, 86, 78, 209, 714, 367, 444, 1614, 5738] 905.416666667
ddocent_filt-simhi sim_loc_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 9844] 820.666666667
ddocent_full-simhi sim_loc_cov	[0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 4, 9844] 820.916666667
aftrrad-simhi sim_loc_cov	No sim_loc_cov stats for aftrrad-simhi
------------------------------------------------------
ipyrad-simlarge sim_loc_cov	[0, 12, 35, 436, 137, 70, 256, 1737, 1688, 2327, 9920, 81527] 8178.75
pyrad-simlarge sim_loc_cov	[0, 13, 38, 446, 141, 72, 258, 1748, 1704, 2342, 9952, 81709] 8201.91666667
stacks_ungapped-simlarge sim_loc_cov	[5, 14, 42, 440, 140, 82, 296, 1743, 1729, 2627, 12400, 78902] 8201.66666667
stacks_gapped-simlarge sim_loc_cov	[5, 14, 42, 440, 140, 82, 296, 1743, 1729, 2627, 12400, 78902] 8201.66666667
stacks_defualt-simlarge sim_loc_cov	[3666, 2729, 3623, 5435, 913, 878, 2294, 7764, 4321, 5593, 17308, 52404] 8910.66666667
ddocent_filt-simlarge sim_loc_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9808, 81396] 7600.33333333
ddocent_full-simlarge sim_loc_cov	[1, 2, 38, 433, 139, 70, 258, 1732, 1689, 2322, 9808, 81397] 8157.41666667
aftrrad-simlarge sim_loc_cov	No sim_loc_cov stats for aftrrad-simlarge
------------------------------------------------------
ipyrad-simno sim_sample_nlocs	[9870, 9870, 9870, 9870, 9870, 9870, 9870, 9870, 9870, 9870, 9870, 9870] 9870.0
pyrad-simno sim_sample_nlocs	[9894, 9894, 9894, 9894, 9894, 9894, 9894, 9894, 9894, 9894, 9894, 9894] 9894.0
stacks_ungapped-simno sim_sample_nlocs	[9870, 9862, 9865, 9863, 9862, 9864, 9864, 9867, 9865, 9867, 9863, 9865] 9864.75
stacks_gapped-simno sim_sample_nlocs	[9870, 9862, 9865, 9863, 9862, 9864, 9864, 9867, 9865, 9867, 9863, 9865] 9864.75
stacks_defualt-simno sim_sample_nlocs	[9320, 9303, 9253, 9096, 9042, 9021, 8958, 8934, 8846, 8823, 8767, 8687] 9004.16666667
ddocent_filt-simno sim_sample_nlocs	[9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852] 9852.0
ddocent_full-simno sim_sample_nlocs	[9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852] 9852.0
aftrrad-simno sim_sample_nlocs	No sim_sample_nlocs stats for aftrrad-simno
------------------------------------------------------
ipyrad-simlo sim_sample_nlocs	[9859, 9859, 9859, 9859, 9859, 9859, 9859, 9859, 9859, 9859, 9859, 9859] 9859.0
pyrad-simlo sim_sample_nlocs	[9884, 9884, 9884, 9884, 9884, 9884, 9884, 9884, 9884, 9884, 9884, 9884] 9884.0
stacks_ungapped-simlo sim_sample_nlocs	[9819, 9808, 9818, 9812, 9832, 9825, 9826, 9801, 9811, 9808, 9817, 9801] 9814.83333333
stacks_gapped-simlo sim_sample_nlocs	[1512, 1503, 1507, 1503, 1507, 1504, 1508, 1508, 1507, 1506, 1508, 1506] 1506.58333333
stacks_defualt-simlo sim_sample_nlocs	[9275, 9254, 9207, 9055, 9020, 8993, 8929, 8888, 8807, 8779, 8741, 8645] 8966.08333333
ddocent_filt-simlo sim_sample_nlocs	[9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853] 9853.0
ddocent_full-simlo sim_sample_nlocs	[9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853] 9853.0
aftrrad-simlo sim_sample_nlocs	No sim_sample_nlocs stats for aftrrad-simlo
------------------------------------------------------
ipyrad-simhi sim_sample_nlocs	[9844, 9844, 9844, 9844, 9844, 9843, 9844, 9844, 9844, 9844, 9844, 9844] 9843.91666667
pyrad-simhi sim_sample_nlocs	[9877, 9877, 9877, 9877, 9877, 9876, 9877, 9877, 9877, 9877, 9877, 9877] 9876.91666667
stacks_ungapped-simhi sim_sample_nlocs	[9741, 9769, 9746, 9746, 9756, 9747, 9756, 9745, 9727, 9731, 9717, 9715] 9741.33333333
stacks_gapped-simhi sim_sample_nlocs	[60, 60, 59, 60, 60, 60, 60, 60, 60, 60, 59, 58] 59.6666666667
stacks_defualt-simhi sim_sample_nlocs	[9187, 9208, 9138, 8985, 8955, 8925, 8878, 8842, 8737, 8716, 8648, 8582] 8900.08333333
ddocent_filt-simhi sim_sample_nlocs	[9848, 9848, 9848, 9848, 9848, 9848, 9846, 9847, 9847, 9848, 9848, 9848] 9847.66666667
ddocent_full-simhi sim_sample_nlocs	[9848, 9848, 9848, 9850, 9850, 9850, 9848, 9848, 9849, 9850, 9850, 9850] 9849.08333333
aftrrad-simhi sim_sample_nlocs	No sim_sample_nlocs stats for aftrrad-simhi
------------------------------------------------------
ipyrad-simlarge sim_sample_nlocs	[95399, 95428, 95405, 95456, 95450, 95412, 95437, 95418, 95283, 95294, 95288, 95302] 95381.0
pyrad-simlarge sim_sample_nlocs	[95646, 95675, 95649, 95708, 95695, 95659, 95686, 95672, 95543, 95552, 95544, 95558] 95632.25
stacks_ungapped-simlarge sim_sample_nlocs	[95366, 95401, 95391, 95400, 95394, 95367, 95411, 95405, 95232, 95276, 95268, 95271] 95348.5
stacks_gapped-simlarge sim_sample_nlocs	[95366, 95401, 95391, 95400, 95394, 95367, 95411, 95405, 95232, 95276, 95268, 95271] 95348.5
stacks_defualt-simlarge sim_sample_nlocs	[90013, 89989, 89284, 87596, 87548, 87334, 86818, 86236, 85427, 85225, 84641, 83680] 86982.5833333
ddocent_filt-simlarge sim_sample_nlocs	[90677, 90706, 90329, 89908, 90701, 90653, 90336, 89898, 90677, 90684, 90271, 89800] 90386.6666667
ddocent_full-simlarge sim_sample_nlocs	[95169, 95200, 95167, 95218, 95203, 95164, 95198, 95177, 95048, 95056, 95049, 95052] 95141.75
aftrrad-simlarge sim_sample_nlocs	No sim_sample_nlocs stats for aftrrad-simlarge
------------------------------------------------------
ipyrad-simno sim_sample_nsnps	[41893, 41893, 41893, 41893, 41893, 41893, 41893, 41893, 41893, 41893, 41893, 41893] 41893.0
pyrad-simno sim_sample_nsnps	[43855, 43854, 43855, 43855, 43855, 43855, 43855, 43855, 43854, 43855, 43854, 43855] 43854.75
stacks_ungapped-simno sim_sample_nsnps	[43704, 43636, 43669, 43674, 43644, 43677, 43668, 43692, 43668, 43694, 43676, 43692] 43674.5
stacks_gapped-simno sim_sample_nsnps	[43704, 43636, 43669, 43674, 43644, 43677, 43668, 43692, 43668, 43694, 43676, 43692] 43674.5
stacks_defualt-simno sim_sample_nsnps	[32242, 32199, 32099, 31740, 31694, 31644, 31502, 31443, 30932, 30922, 30868, 30683] 31497.3333333
ddocent_filt-simno sim_sample_nsnps	[39974, 39973, 39973, 39973, 39974, 39973, 39974, 39971, 39974, 39973, 39974, 39973] 39973.25
ddocent_full-simno sim_sample_nsnps	[39981, 39980, 39980, 39980, 39980, 39979, 39980, 39977, 39981, 39979, 39980, 39980] 39979.75
aftrrad-simno sim_sample_nsnps	[40658, 40658, 40658, 40658, 40658, 40658, 40658, 40658, 40658, 40658, 40658, 40658] 40658.0
------------------------------------------------------
ipyrad-simlo sim_sample_nsnps	[41126, 41126, 41126, 41126, 41126, 41126, 41126, 41126, 41126, 41126, 41126, 41126] 41126.0
pyrad-simlo sim_sample_nsnps	[43106, 43105, 43105, 43099, 43100, 43101, 43104, 43101, 43103, 43106, 43106, 43103] 43103.25
stacks_ungapped-simlo sim_sample_nsnps	[42878, 42825, 42865, 42829, 42921, 42896, 42910, 42819, 42825, 42861, 42865, 42806] 42858.3333333
stacks_gapped-simlo sim_sample_nsnps	[6664, 6634, 6642, 6619, 6645, 6637, 6645, 6646, 6646, 6649, 6646, 6638] 6642.58333333
stacks_defualt-simlo sim_sample_nsnps	[31446, 31372, 31298, 30969, 30944, 30881, 30741, 30662, 30184, 30149, 30129, 29904] 30723.25
ddocent_filt-simlo sim_sample_nsnps	[39879, 39877, 39878, 39875, 39879, 39876, 39879, 39876, 39879, 39877, 39879, 39875] 39877.4166667
ddocent_full-simlo sim_sample_nsnps	[39889, 39887, 39888, 39887, 39890, 39889, 39888, 39887, 39898, 39893, 39893, 39888] 39889.75
aftrrad-simlo sim_sample_nsnps	[40007, 40007, 40007, 40007, 40007, 40007, 40007, 40007, 40007, 40007, 40007, 40007] 40007.0
------------------------------------------------------
ipyrad-simhi sim_sample_nsnps	[40402, 40402, 40402, 40402, 40402, 40399, 40402, 40402, 40400, 40400, 40400, 40400] 40401.0833333
pyrad-simhi sim_sample_nsnps	[42421, 42421, 42417, 42423, 42416, 42421, 42421, 42421, 42427, 42423, 42425, 42423] 42421.5833333
stacks_ungapped-simhi sim_sample_nsnps	[42007, 42161, 42001, 42096, 42063, 42034, 42104, 42068, 41959, 41942, 41970, 42002] 42033.9166667
stacks_gapped-simhi sim_sample_nsnps	[250, 250, 247, 250, 250, 250, 250, 250, 250, 250, 246, 243] 248.833333333
stacks_defualt-simhi sim_sample_nsnps	[30282, 30375, 30183, 29875, 29828, 29762, 29663, 29607, 29029, 28982, 28947, 28822] 29612.9166667
ddocent_filt-simhi sim_sample_nsnps	[40008, 40010, 40003, 40003, 40008, 40008, 40002, 40001, 40005, 40010, 40004, 40002] 40005.3333333
ddocent_full-simhi sim_sample_nsnps	[40041, 40045, 40041, 40049, 40054, 40057, 40043, 40041, 40053, 40060, 40054, 40054] 40049.3333333
aftrrad-simhi sim_sample_nsnps	[39027, 39027, 39027, 39027, 39027, 39027, 39027, 39027, 39027, 39027, 39027, 39027] 39027.0
------------------------------------------------------
ipyrad-simlarge sim_sample_nsnps	[399293, 399339, 399224, 399319, 399355, 399039, 399259, 399070, 398377, 398531, 398376, 398234] 398951.333333
pyrad-simlarge sim_sample_nsnps	[418730, 418786, 418656, 418768, 418782, 418459, 418659, 418481, 417726, 417896, 417747, 417623] 418359.416667
stacks_ungapped-simlarge sim_sample_nsnps	[416856, 416936, 416890, 416846, 416861, 416524, 416844, 416795, 415751, 416064, 415945, 415788] 416508.333333
stacks_gapped-simlarge sim_sample_nsnps	[416856, 416936, 416890, 416846, 416861, 416524, 416844, 416795, 415751, 416064, 415945, 415788] 416508.333333
stacks_defualt-simlarge sim_sample_nsnps	[306434, 306316, 304795, 300499, 300565, 300050, 299270, 297832, 292556, 292370, 291639, 289438] 298480.333333
ddocent_filt-simlarge sim_sample_nsnps	[366270, 366351, 364928, 363394, 366352, 366085, 365000, 363294, 366128, 366304, 364722, 362908] 365144.666667
ddocent_full-simlarge sim_sample_nsnps	[381136, 381212, 381074, 381093, 381176, 380907, 381106, 380897, 380142, 380310, 380280, 380121] 380787.833333
aftrrad-simlarge sim_sample_nsnps	[336112, 336112, 336112, 336112, 336112, 336112, 336112, 336112, 336112, 336112, 336112, 336112] 336112.0
------------------------------------------------------
ipyrad-simno sim_snp_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 41893] 3491.08333333
pyrad-simno sim_snp_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 43852] 3654.58333333
stacks_ungapped-simno sim_snp_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 1362, 42406] 3649.33333333
stacks_gapped-simno sim_snp_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 1362, 42406] 3649.33333333
stacks_defualt-simno sim_snp_cov	[469, 327, 520, 840, 128, 134, 348, 1715, 920, 1063, 4497, 23829] 2899.16666667
ddocent_filt-simno sim_snp_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 39965] 3331.16666667
ddocent_full-simno sim_snp_cov	[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 9, 39970] 3331.75
aftrrad-simno sim_snp_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40658] 3388.16666667
------------------------------------------------------
ipyrad-simlo sim_snp_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 41126] 3427.16666667
pyrad-simlo sim_snp_cov	[1, 0, 1, 1, 1, 0, 0, 2, 1, 2, 19, 43081] 3592.41666667
stacks_ungapped-simlo sim_snp_cov	[28, 36, 42, 67, 12, 2, 20, 145, 150, 237, 2306, 40279] 3610.33333333
stacks_gapped-simlo sim_snp_cov	[0, 0, 0, 0, 0, 0, 0, 3, 0, 6, 233, 6422] 555.333333333
stacks_defualt-simlo sim_snp_cov	[472, 347, 539, 833, 143, 134, 397, 1754, 957, 1155, 4861, 22550] 2845.16666667
ddocent_filt-simlo sim_snp_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19, 39860] 3323.25
ddocent_full-simlo sim_snp_cov	[11, 0, 1, 2, 2, 0, 0, 2, 2, 1, 19, 39866] 3325.5
aftrrad-simlo sim_snp_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40007] 3333.91666667
------------------------------------------------------
ipyrad-simhi sim_snp_cov	[0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 3, 40394] 3367.08333333
pyrad-simhi sim_snp_cov	[1, 10, 6, 6, 1, 14, 0, 10, 7, 6, 60, 42337] 3538.16666667
stacks_ungapped-simhi sim_snp_cov	[58, 103, 112, 222, 54, 41, 50, 437, 403, 528, 3826, 37297] 3594.25
stacks_gapped-simhi sim_snp_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 236] 20.8333333333
stacks_defualt-simhi sim_snp_cov	[477, 350, 564, 862, 169, 166, 462, 1802, 1037, 1398, 5298, 20663] 2770.66666667
ddocent_filt-simhi sim_snp_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 80, 39932] 3334.33333333
ddocent_full-simhi sim_snp_cov	[25, 3, 7, 11, 2, 2, 0, 8, 20, 7, 80, 39940] 3342.08333333
aftrrad-simhi sim_snp_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 39027] 3252.25
------------------------------------------------------
ipyrad-simlarge sim_snp_cov	[0, 22, 46, 786, 305, 175, 721, 5301, 5886, 8687, 39920, 346258] 34008.9166667
pyrad-simlarge sim_snp_cov	[0, 24, 50, 827, 319, 180, 761, 5535, 6183, 9158, 41802, 363123] 35663.5
stacks_ungapped-simlarge sim_snp_cov	[6, 25, 56, 819, 317, 201, 880, 5515, 6288, 10290, 52091, 350753] 35603.4166667
stacks_gapped-simlarge sim_snp_cov	[6, 25, 56, 819, 317, 201, 880, 5515, 6288, 10290, 52091, 350753] 35603.4166667
stacks_defualt-simlarge sim_snp_cov	[4983, 3452, 5104, 8857, 1725, 1881, 5371, 20413, 12908, 18084, 60319, 194817] 28159.5
ddocent_filt-simlarge sim_snp_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 37780, 330513] 30691.0833333
ddocent_full-simlarge sim_snp_cov	[3, 3, 53, 758, 299, 164, 696, 5106, 5745, 8397, 37784, 330563] 32464.25
aftrrad-simlarge sim_snp_cov	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 336112] 28009.3333333
------------------------------------------------------

Pairwise difference and PCA plots for each simulation treatment¶

In [61]:

## Load the calldata into a dict so we don't have to keep loading and reloading it
calldata = {}
for sim in ["-simno", "-simlo", "-simhi", "-simlarge"]:
    for prog in ["ipyrad", "pyrad", "stacks_ungapped", "stacks_gapped", \
                          "stacks_defualt", "ddocent_filt", "ddocent_full", "aftrrad"]:
        print("{}".format(prog+sim)),
        print("{}".format(sim_vcf_dict[prog+sim]))
        c = vcfnp.calldata_2d(sim_vcf_dict[prog+sim], verbose=False).view(np.recarray)
        calldata[prog+sim] = c

ipyrad-simno /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simno/simno_outfiles/simno-biallelic.recode.vcf
pyrad-simno /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simno/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf
stacks_ungapped-simno /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simno/batch_1-biallelic.recode.vcf
stacks_gapped-simno /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simno/batch_1-biallelic.recode.vcf
stacks_defualt-simno /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simno/batch_1-biallelic.recode.vcf
ddocent_filt-simno /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simno/simno_fastqs/Final.recode-biallelic.recode.vcf
ddocent_full-simno /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simno/simno_fastqs/TotalRawSNPs-biallelic.recode.vcf
aftrrad-simno /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simno/Formatting/simno-biallelic.recode.vcf
ipyrad-simlo /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simlo/simlo_outfiles/simlo-biallelic.recode.vcf
pyrad-simlo /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simlo/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf
stacks_ungapped-simlo /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simlo/batch_1-biallelic.recode.vcf
stacks_gapped-simlo /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simlo/batch_1-biallelic.recode.vcf
stacks_defualt-simlo /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simlo/batch_1-biallelic.recode.vcf
ddocent_filt-simlo /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlo/simlo_fastqs/Final.recode-biallelic.recode.vcf
ddocent_full-simlo /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlo/simlo_fastqs/TotalRawSNPs-biallelic.recode.vcf
aftrrad-simlo /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simlo/Formatting/simlo-biallelic.recode.vcf
ipyrad-simhi /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simhi/simhi_outfiles/simhi-biallelic.recode.vcf
pyrad-simhi /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simhi/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf
stacks_ungapped-simhi /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simhi/batch_1-biallelic.recode.vcf
stacks_gapped-simhi /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simhi/batch_1-biallelic.recode.vcf
stacks_defualt-simhi /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simhi/batch_1-biallelic.recode.vcf
ddocent_filt-simhi /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/Final.recode-biallelic.recode.vcf
ddocent_full-simhi /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/TotalRawSNPs-biallelic.recode.vcf
aftrrad-simhi /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simhi/Formatting/simhi-biallelic.recode.vcf
ipyrad-simlarge /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simlarge/simlarge_outfiles/simlarge-biallelic.recode.vcf
pyrad-simlarge /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simlarge/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf
stacks_ungapped-simlarge /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simlarge/batch_1-biallelic.recode.vcf
stacks_gapped-simlarge /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simlarge/batch_1-biallelic.recode.vcf
stacks_defualt-simlarge /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simlarge/batch_1-biallelic.recode.vcf
ddocent_filt-simlarge /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlarge/simlarge_fastqs/Final.recode-biallelic.recode.vcf
ddocent_full-simlarge /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlarge/simlarge_fastqs/TotalRawSNPs-biallelic.recode.vcf
aftrrad-simlarge /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simlarge/Formatting/simlarge-biallelic.recode.vcf

In [617]:

pop1 = ["1A_0", "1B_0", "1C_0", "1D_0"]
pop2 = ["2E_0", "2F_0", "2G_0", "2H_0"]
pop3 = ["3I_0", "3J_0", "3K_0", "3L_0"]
sim_sample_names = pop1 + pop2 + pop3
flt = np.in1d(np.array(sim_sample_names), pop1)
print(flt)

[ True  True  True  True False False False False False False False False]

Plot PCAs for each assembler and each simulated datatype¶

In [619]:

## Some housekeeping with sample names to make the PCA plots prettier
pop1 = ["1A_0", "1B_0", "1C_0", "1D_0"]
pop2 = ["2E_0", "2F_0", "2G_0", "2H_0"]
pop3 = ["3I_0", "3J_0", "3K_0", "3L_0"]
pops = {"pop1":pop1, "pop2":pop2, "pop3":pop3}
pop_colors = {"pop1":"r", "pop2":"b", "pop3":"g"}
sim_sample_names = pop1 + pop2 + pop3

for sim in ["-simno", "-simlo", "-simhi", "-simlarge"]:
#for sim in ["-simhi"]:
    print("Doing - {}".format(sim))
    f, axarr = plt.subplots(2, 4, figsize=(16,8), dpi=1000)
    axarr = [a for b in axarr for a in b]

    for prog, ax in zip(["ipyrad", "pyrad", "stacks_ungapped", "stacks_gapped", \
                              "stacks_defualt", "ddocent_filt", "ddocent_full", "aftrrad"], axarr):
        ## Annoying debug messages
        #print("{}".format(prog+sim)),
        #print("{}".format(sim_vcf_dict[prog+sim]))

        coords1, model1 = getPCA(calldata[prog+sim])

        x = coords1[:, 0]
        y = coords1[:, 1]

        ax.scatter(x, y, marker='o')
        ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
        ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))

        for pop in pops.keys():
            flt = np.in1d(np.array(sim_sample_names), pops[pop])
            ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=pop_colors[pop], label=pop, markersize=10, mec='k', mew=.5)
    
        ax.set_title(prog+sim, style="italic")
        ax.axison = True
    f.tight_layout()

Doing - -simno
Doing - -simlo
Doing - -simhi
Doing - -simlarge

Plot pairwise distances for each assembler and each simulated datatype¶

In [65]:

for sim in ["-simno", "-simlo", "-simhi", "-simlarge"]:
#for sim in ["-simhi"]:
    print("Doing - {}".format(sim))
    f, axarr = plt.subplots(2, 4, figsize=(16,8), dpi=1000)
    axarr = [a for b in axarr for a in b]

    for prog, ax in zip(["ipyrad", "pyrad", "stacks_ungapped", "stacks_gapped", \
                              "stacks_defualt", "ddocent_filt", "ddocent_full", "aftrrad"], axarr):
        print("{}".format(prog+sim)),
        print("{}".format(sim_vcf_dict[prog+sim]))

        ## Calculate pairwise distances
        dist = getDistances(calldata[prog+sim])

        ## Doing it this way works, but allel uses imshow internally which rasterizes the image
        #allel.plot.pairwise_distance(dist, labels=None, ax=ax, colorbar=False)

        ## Create the pcolormesh by hand
        dat = ensure_square(dist)
        
        ## for some reason np.flipud(dat) is chopping off one row of data
        p = ax.pcolormesh(np.arange(0,len(dat[0])), np.arange(0,len(dat[0])), dat,\
        cmap="jet", vmin=np.min(dist), vmax=np.max(dist))
        ## Clip all heatmaps to actual sample size
        p.axes.axis("tight")
    
        ax.set_title(prog+sim, style="italic")
        ax.axison = False

Doing - -simno
ipyrad-simno /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simno/simno_outfiles/simno-biallelic.recode.vcf
pyrad-simno /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simno/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf
stacks_ungapped-simno /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simno/batch_1-biallelic.recode.vcf
stacks_gapped-simno /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simno/batch_1-biallelic.recode.vcf
stacks_defualt-simno /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simno/batch_1-biallelic.recode.vcf
ddocent_filt-simno /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simno/simno_fastqs/Final.recode-biallelic.recode.vcf
ddocent_full-simno /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simno/simno_fastqs/TotalRawSNPs-biallelic.recode.vcf
aftrrad-simno /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simno/Formatting/simno-biallelic.recode.vcf
Doing - -simlo
ipyrad-simlo /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simlo/simlo_outfiles/simlo-biallelic.recode.vcf
pyrad-simlo /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simlo/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf
stacks_ungapped-simlo /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simlo/batch_1-biallelic.recode.vcf
stacks_gapped-simlo /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simlo/batch_1-biallelic.recode.vcf
stacks_defualt-simlo /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simlo/batch_1-biallelic.recode.vcf
ddocent_filt-simlo /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlo/simlo_fastqs/Final.recode-biallelic.recode.vcf
ddocent_full-simlo /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlo/simlo_fastqs/TotalRawSNPs-biallelic.recode.vcf
aftrrad-simlo /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simlo/Formatting/simlo-biallelic.recode.vcf
Doing - -simhi
ipyrad-simhi /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simhi/simhi_outfiles/simhi-biallelic.recode.vcf
pyrad-simhi /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simhi/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf
stacks_ungapped-simhi /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simhi/batch_1-biallelic.recode.vcf
stacks_gapped-simhi /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simhi/batch_1-biallelic.recode.vcf
stacks_defualt-simhi /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simhi/batch_1-biallelic.recode.vcf
ddocent_filt-simhi /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/Final.recode-biallelic.recode.vcf
ddocent_full-simhi /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/TotalRawSNPs-biallelic.recode.vcf
aftrrad-simhi /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simhi/Formatting/simhi-biallelic.recode.vcf
Doing - -simlarge
ipyrad-simlarge /home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simlarge/simlarge_outfiles/simlarge-biallelic.recode.vcf
pyrad-simlarge /home/iovercast/manuscript-analysis/pyrad/SIMDATA/simlarge/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf
stacks_ungapped-simlarge /home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simlarge/batch_1-biallelic.recode.vcf
stacks_gapped-simlarge /home/iovercast/manuscript-analysis/stacks/SIMDATA/gapped/simlarge/batch_1-biallelic.recode.vcf
stacks_defualt-simlarge /home/iovercast/manuscript-analysis/stacks/SIMDATA/default/simlarge/batch_1-biallelic.recode.vcf
ddocent_filt-simlarge /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlarge/simlarge_fastqs/Final.recode-biallelic.recode.vcf
ddocent_full-simlarge /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlarge/simlarge_fastqs/TotalRawSNPs-biallelic.recode.vcf
aftrrad-simlarge /home/iovercast/manuscript-analysis/aftrRAD/SIMDATA/simlarge/Formatting/simlarge-biallelic.recode.vcf

Go through and pull in more fine grained results¶

What we want to know is total number of loci recovered (not just variable loci). First we'll create some dictionaries just like above, but we'll call them full to indicate that they include monomorphic sites. Because snps don't occur in monomorphic sites kind of by definition, we only really are iterested in the depth across loci and the number of loci recovered per sample.

In [71]:

sim_full_loc_cov = collections.OrderedDict()
sim_full_sample_nlocs = collections.OrderedDict()
## Try just doing them all the same
for prog, filename in sim_vcf_dict.items():
    print("Doing - {}\t".format(prog)),
    sim_full_loc_cov[prog] = []
    sim_full_sample_nlocs[prog] = []

Doing - ipyrad-simlarge	Doing - pyrad-simhi	Doing - pyrad-simlo	Doing - ddocent_full-simlarge	Doing - ddocent_filt-simlo	Doing - stacks_gapped-simno	Doing - stacks_ungapped-simlarge	Doing - aftrrad-simno	Doing - aftrrad-simlo	Doing - ddocent_filt-simno	Doing - stacks_gapped-simlo	Doing - ipyrad-simlo	Doing - stacks_defualt-simhi	Doing - ddocent_filt-simlarge	Doing - stacks_gapped-simhi	Doing - ipyrad-simno	Doing - aftrrad-simlarge	Doing - stacks_ungapped-simhi	Doing - aftrrad-simhi	Doing - stacks_ungapped-simno	Doing - stacks_defualt-simlarge	Doing - stacks_ungapped-simlo	Doing - ipyrad-simhi	Doing - pyrad-simno	Doing - ddocent_full-simhi	Doing - stacks_defualt-simno	Doing - pyrad-simlarge	Doing - ddocent_full-simlo	Doing - stacks_gapped-simlarge	Doing - stacks_defualt-simlo	Doing - ddocent_filt-simhi	Doing - ddocent_full-simno

ipyrad simulated results¶

In [407]:

## ipyrad stats
IPYRAD_SIMOUT=os.path.join(IPYRAD_DIR, "SIMDATA/")
for sim in ["simno", "simlo", "simhi", "simlarge"]:
    print("Doing - {}\t".format(sim)),
    simstring = "ipyrad-" + sim
    simdir = os.path.join(IPYRAD_SIMOUT, sim)
    statsfile = simdir + "/{}_outfiles/{}_stats.txt".format(sim, sim)
    infile = open(statsfile).readlines()
    sample_coverage = [int(x.strip().split()[1]) for x in infile[20:32]]
    #print(sample_coverage)
    print("mean sample coverage - {}\t".format(np.mean(sample_coverage))),
    print("min/max - {}/{}\t".format(np.min(sample_coverage), np.max(sample_coverage)))
    sim_full_sample_nlocs[simstring] = sample_coverage
    
    nmissing = [int(x.strip().split()[1]) for x in infile[38:50]]
    sim_full_loc_cov[simstring] = nmissing

## Just look at the ones we care about for ipyrad
print([(x,y) for x,y in sim_full_loc_cov.items() if "ipyrad" in x])
print([(x,y) for x,y in sim_full_sample_nlocs.items() if "ipyrad" in x])

Doing - simno	mean sample coverage - 10000.0	min/max - 10000/10000	
Doing - simlo	mean sample coverage - 10000.0	min/max - 10000/10000	
Doing - simhi	mean sample coverage - 9999.91666667	min/max - 9999/10000	
Doing - simlarge	mean sample coverage - 96942.6666667	min/max - 96917/96986	
[('ipyrad-simlarge', [0, 27, 61, 600, 158, 80, 274, 1851, 1742, 2385, 10113, 82659]), ('ipyrad-simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10000]), ('ipyrad-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10000]), ('ipyrad-simhi', [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 9998]), ('ipyrad_simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10000]), ('ipyrad_simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10000]), ('ipyrad_simhi', [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 9998]), ('ipyrad_simlarge', [0, 27, 61, 600, 158, 80, 274, 1851, 1742, 2385, 10113, 82659])]
[('ipyrad-simlarge', [96923, 96959, 96931, 96986, 96972, 96933, 96958, 96945, 96917, 96928, 96926, 96934]), ('ipyrad-simlo', [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000]), ('ipyrad-simno', [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000]), ('ipyrad-simhi', [10000, 10000, 10000, 10000, 10000, 9999, 10000, 10000, 10000, 10000, 10000, 10000])]

pyrad simulated results¶

In [406]:

## ipyrad stats
PYRAD_SIMOUT=os.path.join(PYRAD_DIR, "SIMDATA/")
for sim in ["simno", "simlo", "simhi", "simlarge"]:
    simstring = "pyrad-"+sim
    print("Doing - {}\t".format(sim)),
    simdir = os.path.join(PYRAD_SIMOUT, sim)
    statsfile = simdir + "/stats/c85d6m2p3H3N3.stats"
    infile = open(statsfile).readlines()
    sample_coverage = [int(x.strip().split()[1]) for x in infile[8:20]]
    print("mean sample coverage - {}\t".format(np.mean(sample_coverage))),
    print("min/max - {}/{}\t".format(np.min(sample_coverage), np.max(sample_coverage)))
    sim_full_sample_nlocs[simstring] = sample_coverage
    
    nmissing = [0] + [int(x.strip().split()[1]) for x in infile[26:37]]
    sim_full_loc_cov[simstring] = nmissing

## Just look at the ones we care about for pyrad
print([(x,y) for x,y in sim_full_loc_cov.items() if "pyrad" in x])
print([(x,y) for x,y in sim_full_sample_nlocs.items() if "pyrad" in x])

Doing - simno	mean sample coverage - 10000.0	min/max - 10000/10000	
Doing - simlo	mean sample coverage - 10000.0	min/max - 10000/10000	
Doing - simhi	mean sample coverage - 9999.91666667	min/max - 9999/10000	
Doing - simlarge	mean sample coverage - 96943.6666667	min/max - 96918/96987	
[('ipyrad-simlarge', []), ('pyrad-simhi', [0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 9998]), ('pyrad-simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10000]), ('ipyrad-simlo', []), ('ipyrad-simno', []), ('ipyrad-simhi', []), ('pyrad-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10000]), ('pyrad-simlarge', [0, 27, 61, 600, 158, 80, 274, 1851, 1742, 2385, 10113, 82660]), ('ipyrad_simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10000]), ('ipyrad_simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10000]), ('ipyrad_simhi', [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 9998]), ('ipyrad_simlarge', [0, 27, 61, 600, 158, 80, 274, 1851, 1742, 2385, 10113, 82659]), ('pyrad_simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10000]), ('pyrad_simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10000]), ('pyrad_simhi', [0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 9998]), ('pyrad_simlarge', [0, 27, 61, 600, 158, 80, 274, 1851, 1742, 2385, 10113, 82660])]
[('ipyrad-simlarge', [96923, 96959, 96931, 96986, 96972, 96933, 96958, 96945, 96917, 96928, 96926, 96934]), ('pyrad-simhi', [10000, 10000, 10000, 10000, 10000, 9999, 10000, 10000, 10000, 10000, 10000, 10000]), ('pyrad-simlo', [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000]), ('ipyrad-simlo', [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000]), ('ipyrad-simno', [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000]), ('ipyrad-simhi', [10000, 10000, 10000, 10000, 10000, 9999, 10000, 10000, 10000, 10000, 10000, 10000]), ('pyrad-simno', [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000]), ('pyrad-simlarge', [96924, 96960, 96932, 96987, 96973, 96934, 96959, 96946, 96918, 96929, 96927, 96935]), ('pyrad_simno', [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000]), ('pyrad_simlo', [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000]), ('pyrad_simhi', [10000, 10000, 10000, 10000, 10000, 9999, 10000, 10000, 10000, 10000, 10000, 10000]), ('pyrad_simlarge', [96924, 96960, 96932, 96987, 96973, 96934, 96959, 96946, 96918, 96929, 96927, 96935])]

stacks simulated results¶

In [111]:

import gzip

## stacks stats
STACKS_SIMOUT=os.path.join(STACKS_DIR, "SIMDATA/")
STACKS_GAP_SIMOUT=os.path.join(STACKS_SIMOUT, "gapped/")
STACKS_UNGAP_SIMOUT=os.path.join(STACKS_SIMOUT, "ungapped/")
STACKS_DEFAULT_SIMOUT=os.path.join(STACKS_SIMOUT, "default/")

for dir in [STACKS_GAP_SIMOUT, STACKS_UNGAP_SIMOUT, STACKS_DEFAULT_SIMOUT]:
    stacks_method = dir.split("/")[-2]
    for sim in ["simno", "simlo", "simhi", "simlarge"]:
        #print("Doing - {}-{}".format(stacks_method, sim)),
        simstring = "stacks_"+stacks_method+"-"+sim
        try:
            simdir = os.path.join(dir, sim)
            lines = open("{}/batch_1.haplotypes.tsv".format(simdir)).readlines()
            cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]]
            sim_full_loc_cov[simstring] = [cnts.count(i) for i in range(1,13)]
        except Exception as inst:
            print("loc_cov - {} - {}".format(inst, simdir))

        try:
            sim_full_sample_nlocs[simstring] = []
            samp_haps = glob.glob("{}/*matches*".format(simdir))
            for f in samp_haps:
                lines = gzip.open(f).readlines()
                sim_full_sample_nlocs[simstring].append(len(lines) - 1)
        except Exception as inst:
            print("sample_nlocs - {} - {}".format(inst, simdir))
## Just look at the stacks results to reduce the clutter
print([(x,y) for x,y in sim_full_loc_cov.items() if "stacks" in x])
print([(x,y) for x,y in sim_full_sample_nlocs.items() if "stacks" in x])

[('stacks_gapped-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 336, 9652]), ('stacks_ungapped-simlarge', [49, 29, 67, 593, 156, 92, 311, 1840, 1767, 2674, 12583, 79747]), ('stacks_gapped-simlo', [19, 5, 1, 3, 1, 1, 1, 1, 1, 15, 348, 9623]), ('stacks_defualt-simhi', []), ('stacks_gapped-simhi', [241, 28, 25, 28, 5, 1, 4, 29, 21, 39, 506, 9387]), ('stacks_ungapped-simhi', [1322, 174, 123, 148, 19, 13, 21, 148, 112, 137, 956, 8611]), ('stacks_ungapped-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 336, 9652]), ('stacks_defualt-simlarge', []), ('stacks_ungapped-simlo', [589, 79, 54, 56, 7, 2, 8, 52, 50, 67, 584, 9231]), ('stacks_defualt-simno', []), ('stacks_gapped-simlarge', [49, 29, 67, 593, 156, 92, 311, 1840, 1767, 2674, 12583, 79747]), ('stacks_defualt-simlo', []), ('stacks_default-simno', [3307, 709, 652, 748, 85, 66, 175, 727, 332, 353, 1349, 6446]), ('stacks_default-simlo', [3868, 754, 699, 773, 92, 73, 200, 749, 354, 387, 1488, 6207]), ('stacks_default-simhi', [4592, 837, 740, 833, 105, 90, 239, 783, 390, 465, 1660, 5842]), ('stacks_default-simlarge', [33633, 7241, 6542, 7614, 1102, 1025, 2505, 8307, 4487, 5750, 17674, 53253])]
[('stacks_gapped-simno', [11381, 11435, 11446, 11406, 11400, 11432, 11468, 11442, 11393, 11438, 11466, 11365]), ('stacks_ungapped-simlarge', [110839, 110710, 110603, 110764, 110698, 110572, 110670, 110795, 110708, 110617, 110649, 110860]), ('stacks_gapped-simlo', [11365, 11399, 11420, 11375, 11381, 11395, 11435, 11423, 11354, 11408, 11428, 11343]), ('stacks_defualt-simhi', []), ('stacks_gapped-simhi', [11311, 11365, 11389, 11342, 11332, 11371, 11406, 11400, 11336, 11382, 11424, 11320]), ('stacks_ungapped-simhi', [11371, 11444, 11438, 11413, 11403, 11426, 11466, 11451, 11382, 11428, 11472, 11355]), ('stacks_ungapped-simno', [11381, 11435, 11446, 11406, 11400, 11432, 11468, 11442, 11393, 11438, 11466, 11365]), ('stacks_defualt-simlarge', []), ('stacks_ungapped-simlo', [11388, 11426, 11458, 11407, 11409, 11432, 11471, 11444, 11384, 11434, 11469, 11365]), ('stacks_defualt-simno', []), ('stacks_gapped-simlarge', [110839, 110710, 110603, 110764, 110698, 110572, 110670, 110795, 110708, 110617, 110649, 110860]), ('stacks_defualt-simlo', []), ('stacks_default-simno', [11381, 11434, 11372, 11250, 11132, 10993, 10974, 10982, 11010, 10897, 10824, 10702]), ('stacks_default-simlo', [11385, 11426, 11388, 11256, 11155, 11010, 10991, 10997, 11013, 10905, 10843, 10724]), ('stacks_default-simhi', [11371, 11443, 11368, 11271, 11155, 11017, 11002, 11025, 11022, 10924, 10878, 10739]), ('stacks_gapped-large', []), ('stacks_ungapped-large', []), ('stacks_default-large', []), ('stacks_default-simlarge', [110835, 110690, 109833, 109066, 108063, 106263, 105795, 106033, 106842, 104942, 104216, 104106])]

AftrRAD simulated results¶

AftrRAD doesn't natively support vcf. the vcf files we made are in sim*/Formatting/sim*.vcf

It's not straightforward how to get the locations for snps from the output files. aftrRAD does calculate and write these out as part of the Genotype.pl phase to a file called /TempFiles/SNPLocationsToPlot.txt

It's also not straightforward how to get the number of loci recovered for each invidual. You have to total up the counts from two different files for each individual. The Outputs/Genotypes directory has two files, one for Haplotypes and one for Monomorphics. The haplotypes file is a giant matrix of concatenated snps per locus so you have to search through each column to count up all the NA's (missing data per locus per individual). Then you have to count up the number of non-zero monomorphic sites.

In [192]:

AFTRRAD_SIMOUT=os.path.join(AFTRRAD_DIR, "SIMDATA/")
for sim in ["simno", "simlo", "simhi", "simlarge"]:
#for sim in ["simlo"]:
    simstring = "aftrrad-{}".format(sim)
    print("Doing - {}".format(simstring))
    simdir = os.path.join(AFTRRAD_SIMOUT, sim)
    ## The `17` in the file name is the min % of samples w/ data
    hapsfile = simdir + "/Output/Genotypes/Haplotypes_17.All.txt"
    monofile = simdir + "/Output/Genotypes/Monomorphics_17.txt"

    ## Set low_memory=False or else simlarge crashes pandas
    mono_df = pd.read_csv(monofile, delim_whitespace=True, header=0, low_memory=False)
    haps_df = pd.read_csv(hapsfile, delim_whitespace=True, header=0, index_col=0, low_memory=False)

    sample_coverage = {}
    ## Get the list of all the simulated sample names
    samplenames = mono_df.columns.values[1:]
    ## Do haplotypes file first because it is phased so there are 2 rows per sample
    ## so we end up double counting NA's. If you read the # of monomorphics first
    ## then you have to figure out some way to prevent the double counting, here
    ## we just count twice and overwrite.
    for row in haps_df.itertuples():        
        sample_coverage[row[0].split("Individual")[1]] = len(pd.Series(row).nonzero()[0])

    for sname in samplenames:
        ## nonzero returns a tuple of which we are only interested in the first element
        sample_coverage[sname] += len(mono_df[sname].nonzero()[0])

    ## To sort or not to sort the coverage_df?
    print("mean sample coverage - {}".format(np.mean(sample_coverage.values()))),
    print("min/max - {}/{}".format(np.min(sample_coverage.values()), np.max(sample_coverage.values())))

    ## Put them in the dict sorted by sample name
    sim_full_sample_nlocs[simstring] = [sample_coverage[x] for x in sorted(sample_coverage.keys())]

    ## Now do locus coverage
    ## For the same reason as above, do the haplotypes file first. We create the tmp_counts
    ## counter and then make a new counter with the index being 1/2 to account for the
    ## doublecounting of NaN in the haplotypes file.
    ## Also, note that annoyingly pandas reads in "NA" as float('nan') rather than string "NA"
    ## which took me a _while_ to figure out.
    hap_nonzero = collections.Counter(haps_df.count())
    tmp_counts = collections.Counter(hap_nonzero)
    hap_counts = collections.Counter()
    for x in tmp_counts:
        hap_counts.update({x/2:tmp_counts[x]})

    ## Get monomorphics
    ## Get the sum of non-zero elements per row (the fancy .ix slicing drops the 
    ## first column which is the sequence)
    mono_nonzero = (mono_df.ix[:, 1:] != 0).astype(int).sum(axis=1)
    mono_counts = collections.Counter(mono_nonzero)

    tot_counts = hap_counts + mono_counts
    ## Collections are unordered, and also any coverage level with no count
    ## will not be in the collection, so we have to order it and insert zeros at the appropriate locations
    dat = []
    for i in xrange(1,13):
        try:
            dat.append(tot_counts[i])
        except:
            dat.append(0)
    sim_full_loc_cov[simstring] = dat

print("Locus coverage")
print([(x,y) for x,y in sim_full_loc_cov.items() if "aftrrad" in x])
print("Sample nloci")
print([(x,y) for x,y in sim_full_sample_nlocs.items() if "aftrrad" in x])

 Doing - aftrrad-simlarge mean sample coverage aftrrad-simlarge - 100172.75 min/max - 100132/100249
Locus coverage
[('aftrrad-simno', [0, 1, 17, 29, 5, 1, 3, 27, 15, 9, 36, 9910]), ('aftrrad-simlo', [0, 3, 20, 37, 7, 0, 4, 34, 18, 16, 58, 9870]), ('aftrrad-simlarge', [0, 19, 226, 838, 213, 117, 332, 2078, 1878, 2529, 10526, 81656]), ('aftrrad-simhi', [0, 11, 35, 57, 5, 1, 5, 53, 30, 22, 129, 9761])]
Sample nloci
[('aftrrad-simno', [10051, 10051, 10051, 10051, 10050, 10050, 10050, 10050, 10052, 10053, 10053, 10052]), ('aftrrad-simlo', [10059, 10060, 10058, 10059, 10059, 10059, 10059, 10058, 10061, 10062, 10062, 10059]), ('aftrrad-simlarge', [100138, 100141, 100137, 100135, 100141, 100134, 100137, 100132, 100244, 100244, 100249, 100241]), ('aftrrad-simhi', [10088, 10089, 10083, 10082, 10087, 10085, 10082, 10081, 10089, 10090, 10087, 10085])]

dDocent simulated results¶

~~Here's what's weird. The dDocent reference.fasta file which is generated by the cd-hit clustering looks like it contains the right number of clusters. Similarly, if you use samtools to view the sample -RG.bam, and the mapped.bed file has 10000 loci as well. But if you look at the Contigs that make it to the final vcf files (either TotalRawSNPs or Final.recode) there are ~100+ contigs missing. I'm sure freebayes is filtering loci, but i diff'd the loci in the bed file and the vcf, and looked at the sequences of one of the missing loci in the cat-RRG.bam file and they look more or less normal. It's not the mapping quality flaq (-m), because all sim seqs have high quality mapping. None of the other settings in the dDocent script or the default freebayes settings seem particularly conspicuous.~~ Monomorphic loci obviously aren't going to end up in the final vcf. It is incredibly not straightforward how to actually get access to the monomorphic loci from the intermediary files that dDocent creates. The shortcuts that dDocent takes to run super fast mean it's really hard to get "stacks" of sequence data per locus (e.g. the ipyrad .loci file).

    grep Contig TotalRawSNPs.vcf | cut -f 1 | uniq -c | wc -l

simno/simlo/simhi - 9887/9889/9890

For simlarge there was a much bigger difference between the TotalRaw and Final.recode vcf files (98350/91620), TotalRaw recovers about 98.4% of true loci, but the final recode catches only about 92%.

It's also not super straightforward how to get # loci per sample from the dDocent output, so I had to gin something up. Right now because it uses samtools to view the bam file and then does some shell manipulation, this is a little slow (20 seconds per simulation treatment), but tolerable.

TODO: This is currently not tracking which samples correspond with the number of loci recovered bcz the ddocent bam files use the weird naming scheme and I haven't back calculated the good names yet.

This isn't exactly technically correct, because this is the counts of loci mapped for each individual, not the number of loci for each individual in the output!

Also, this is a little slow. Takes about 30-40 minutes.

In [ ]:

import subprocess

DDOCENT_SIMOUT=os.path.join(DDOCENT_DIR, "SIMDATA/")
for sim in ["simno", "simlo", "simhi", "simlarge"]:
#for sim in ["simlo"]:
    simdir = os.path.join(DDOCENT_SIMOUT, sim + "/" + sim + "_fastqs/")
    print("Doing {} - {}".format(sim, simdir))

    ## This is hackish. You have to dip into the bam file to get locus counts that
    ## include counts for monomorphics. Note! that field 3 of the bam file is not
    ## the sequence data, but rather the dDocent mock-contig ('dDocent_Contig_3') 
    ## the read maps to. So really this is kind of by proxy counting the number of loci.
    sample_coverage = []
    for samp in glob.glob(simdir + "/*-RG.bam"):
        cmd = "{}samtools view {} | cut -f 3 | uniq | wc -l".format(DDOCENT_DIR, samp)
        res = subprocess.check_output(cmd, shell=True)
        sample_coverage.append(int(res.strip()))
    print("This must be taken with a grain of salt because these aren't the actual counts you get out of the final data files.")
    print("mean sample coverage - {}".format(np.mean(sample_coverage)))
    print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))
    
    print("Reading VCF")
    vcf_filt = pd.read_csv("{}/Final.recode.vcf".format(simdir), delim_whitespace=True, header=60, index_col=0)
    vcf_full = pd.read_csv("{}/TotalRawSNPs.vcf".format(simdir), delim_whitespace=True, header=60, index_col=0)

    for vcf_string, vcf_df in zip(["full", "filt"], [vcf_full, vcf_filt]):
        simstring = "ddocent_" + vcf_string + "-" + sim
        print("Doing {}".format(simstring))
        ## Sample coverage is the same for both conditions here, because they are based on the
        ## same raw data.
        idxs = set(vcf_df.index)

        ## Here we are going to simultaneously accumulate information about nloci per sample and locus coverage counts.
        c = []
        sample_cov_counter = collections.Counter()
        for i, idx in enumerate(idxs):
            ## Poor man's progress bar. For the impatient.
            if not i % 1000:
                print(i),

            ## Complicated song and dance here because if the locus only has one snp you have to do
            ## some bullshit to reshape the dataframe.
            tmp_df = vcf_df.loc[idx]
            if isinstance(tmp_df, pd.Series):
                tmp_df = tmp_df.to_frame().T
                
            ## The double apply is applying split() to all rows and columns to get the genotype data
            nonzero_samps = (tmp_df.iloc[:, 8:20].apply(lambda x: x.apply(lambda y: y.split(":")[0])) != "./.").\
                              astype(int).sum().nonzero()[0]
            sample_cov_counter.update(nonzero_samps)
            c.append(len(nonzero_samps))
                
            ## The double apply is applying split() to all rows and columns to get the genotype data
#            count = len((tmp_df.iloc[:, 8:20].\
#                apply(lambda x: x.apply(lambda y: y.split(":")[0])) != "./.").astype(int).sum().nonzero()[0])
#            c.append(count)
        counts = collections.Counter(c)

        ## Fill in zero values that aren't in the locus counter
        dat = []
        for i in xrange(1,13):
            try:
                dat.append(counts[i])
            except:
                dat.append(0)

        sim_full_loc_cov[simstring] = dat
        sim_full_sample_nlocs[simstring] = sample_cov_counter.values()

# Here's a different stupid way to do this...
#
#        sample_counts = {}
#        sample_names = list(vcf.columns)[8:]
#        print(sample_names)
#        print("num loci - {}".format(len(set(vcf.index))))
#        for name in sample_names:
#            print(name)
#            sample_counts[name] = 0
#            for locus in set(vcf.index):
#                snps = vcf[name][locus]
#                if any(map(lambda x: x.split(":")[0] != "./.", snps)):
#                    sample_counts[name] += 1
#                else:
#                    pass
#                    #print("{} {} {}".format(name, locus, snps))
#        print(sample_counts)

    print("Locus coverage")
    print([(x,y) for x,y in sim_full_loc_cov.items() if "ddocent" in x])
    print("Sample nloci")
    print([(x,y) for x,y in sim_full_sample_nlocs.items() if "ddocent" in x])

Doing simno - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simno/simno_fastqs/
This must be taken with a grain of salt because these aren't the actual counts you get out of the final data files.
mean sample coverage - 10000.0
min/max - 10000/10000
Reading VCF
Doing ddocent_full-simno
0 1000 2000 3000 4000 5000 6000 7000 8000 9000 Doing ddocent_filt-simno
0 1000 2000 3000 4000 5000 6000 7000 8000 9000 Locus coverage
[('ddocent_full-simlarge', []), ('ddocent_filt-simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9888]), ('ddocent_filt-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9886]), ('ddocent_filt-simlarge', []), ('ddocent_full-simhi', [0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 4, 9881]), ('ddocent_full-simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9888]), ('ddocent_filt-simhi', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 9881]), ('ddocent_full-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9886])]
Sample nloci
[('ddocent_full-simlarge', []), ('ddocent_filt-simlo', []), ('ddocent_filt-simno', [9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886]), ('ddocent_filt-simlarge', []), ('ddocent_full-simhi', []), ('ddocent_full-simlo', []), ('ddocent_filt-simhi', []), ('ddocent_full-simno', [9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886])]
Doing simlo - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlo/simlo_fastqs/
This must be taken with a grain of salt because these aren't the actual counts you get out of the final data files.
mean sample coverage - 10000.0
min/max - 10000/10000
Reading VCF
Doing ddocent_full-simlo
0 1000 2000 3000 4000 5000 6000 7000 8000 9000 Doing ddocent_filt-simlo
0 1000 2000 3000 4000 5000 6000 7000 8000 9000 Locus coverage
[('ddocent_full-simlarge', []), ('ddocent_filt-simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9888]), ('ddocent_filt-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9886]), ('ddocent_filt-simlarge', []), ('ddocent_full-simhi', [0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 4, 9881]), ('ddocent_full-simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9888]), ('ddocent_filt-simhi', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 9881]), ('ddocent_full-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9886])]
Sample nloci
[('ddocent_full-simlarge', []), ('ddocent_filt-simlo', [9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888]), ('ddocent_filt-simno', [9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886]), ('ddocent_filt-simlarge', []), ('ddocent_full-simhi', []), ('ddocent_full-simlo', [9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888]), ('ddocent_filt-simhi', []), ('ddocent_full-simno', [9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886])]
Doing simhi - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/
This must be taken with a grain of salt because these aren't the actual counts you get out of the final data files.
mean sample coverage - 9998.75
min/max - 9997/10000
Reading VCF
Doing ddocent_full-simhi
0 1000 2000 3000 4000 5000 6000 7000 8000 9000 Doing ddocent_filt-simhi
0 1000 2000 3000 4000 5000 6000 7000 8000 9000 Locus coverage
[('ddocent_full-simlarge', []), ('ddocent_filt-simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9888]), ('ddocent_filt-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9886]), ('ddocent_filt-simlarge', []), ('ddocent_full-simhi', [0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 4, 9881]), ('ddocent_full-simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9888]), ('ddocent_filt-simhi', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 9881]), ('ddocent_full-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9886])]
Sample nloci
[('ddocent_full-simlarge', []), ('ddocent_filt-simlo', [9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888]), ('ddocent_filt-simno', [9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886]), ('ddocent_filt-simlarge', []), ('ddocent_full-simhi', [9885, 9885, 9886, 9887, 9888, 9888, 9886, 9886, 9886, 9887, 9887, 9888]), ('ddocent_full-simlo', [9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888]), ('ddocent_filt-simhi', [9885, 9885, 9885, 9885, 9885, 9885, 9883, 9884, 9884, 9885, 9885, 9885]), ('ddocent_full-simno', [9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886])]
Doing simlarge - /home/iovercast/manuscript-analysis/dDocent/SIMDATA/simlarge/simlarge_fastqs/
This must be taken with a grain of salt because these aren't the actual counts you get out of the final data files.
mean sample coverage - 96949.5
min/max - 96919/96988
Reading VCF
Doing ddocent_full-simlarge
0 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 13000 14000 15000 16000 17000 18000 19000 20000 21000 22000 23000 24000 25000 26000 27000 28000 29000 30000 31000 32000 33000 34000 35000 36000 37000 38000 39000 40000 41000 42000 43000 44000 45000 46000 47000 48000 49000 50000 51000 52000 53000 54000 55000 56000 57000 58000 59000 60000 61000 62000 63000 64000 65000 66000 67000 68000 69000 70000 71000 72000 73000 74000 75000 76000 77000 78000 79000 80000 81000 82000 83000 84000 85000 86000 87000 88000 89000 90000 91000 92000 93000 94000 95000 96000 97000 10000 11000 12000 13000 14000 15000 16000 17000 18000 19000 20000 21000 22000 23000 24000 25000 26000 27000 28000 29000 30000 31000 32000 33000 34000 35000 36000 37000 38000 39000 40000 41000 42000 43000 44000 45000 46000 47000 48000 49000 50000 51000 52000 53000 54000 55000 56000 57000 58000 59000 60000 61000 62000 63000 64000 65000 66000 67000 68000 69000 70000 71000 72000 73000 74000 75000 76000 77000 78000 79000 80000 81000 82000 83000 84000 85000 86000 87000 88000 89000 90000 91000 Locus coverage
[('ddocent_full-simlarge', [1, 2, 38, 434, 141, 71, 258, 1748, 1703, 2334, 9853, 81766]), ('ddocent_filt-simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9888]), ('ddocent_filt-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9886]), ('ddocent_filt-simlarge', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9853, 81766]), ('ddocent_full-simhi', [0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 4, 9881]), ('ddocent_full-simlo', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9888]), ('ddocent_filt-simhi', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 9881]), ('ddocent_full-simno', [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9886])]
Sample nloci
[('ddocent_full-simlarge', [95609, 95639, 95606, 95654, 95650, 95609, 95639, 95625, 95492, 95499, 95494, 95502]), ('ddocent_filt-simlo', [9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888]), ('ddocent_filt-simno', [9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886]), ('ddocent_filt-simlarge', [91091, 91119, 90741, 90312, 91116, 91065, 90744, 90308, 91090, 91095, 90682, 90212]), ('ddocent_full-simhi', [9885, 9885, 9886, 9887, 9888, 9888, 9886, 9886, 9886, 9887, 9887, 9888]), ('ddocent_full-simlo', [9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888, 9888]), ('ddocent_filt-simhi', [9885, 9885, 9885, 9885, 9885, 9885, 9883, 9884, 9884, 9885, 9885, 9885]), ('ddocent_full-simno', [9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886, 9886])]

Write out results of the simulations¶

This data is worth its weight in gold, it takes forever to acquire, so lets save it off to disk so we don't lose it.

In [502]:

import pickle
sim_res_dict = {"sim_sample_nlocs_df":sim_full_sample_nlocs, "sim_full_loc_cov":sim_full_loc_cov}

for k,v in sim_res_dict.items():
    print(k)
    pickle.dump(v, open(WORK_DIR + "RESULTS/" + k + ".p", 'wb'))

sim_full_loc_cov
sim_sample_nlocs_df

Plotting simulation results¶

In [443]:

simlevel = ["simno", "simlo", "simhi", "simlarge"]
assemblers = ["ipyrad", "pyrad", "stacks_gapped", "stacks_ungapped",\
              "stacks_default", "aftrrad", "ddocent_full", "ddocent_filt"]

sim_sample_nlocs_df = pd.DataFrame(index=assemblers, columns=simlevel)

for sim in simlevel:
    for ass in assemblers:
        simstring = "-".join([ass, sim])
        sim_sample_nlocs_df[sim][ass] = np.mean(sim_full_sample_nlocs[simstring])
print("Mean number of loci recovered per sample.")
## Normalize all bins
dat = sim_sample_nlocs_df[sim_sample_nlocs_df.columns].astype(float)
for sim in simlevel:
    scale = 10000
    if sim == "simlarge":
        scale = 100000
    dat[sim] = dat[sim]/scale
sns.heatmap(dat, square=True, center=1, linewidths=2, annot=True)
print(sim_sample_nlocs_df)
#print(dat)

Mean number of loci recovered per sample.
                   simno    simlo    simhi simlarge
ipyrad             10000    10000  9999.92  96942.7
pyrad              10000    10000  9999.92  96943.7
stacks_gapped    11422.7  11393.8  11364.8   110707
stacks_ungapped  11422.7  11423.9  11420.8   110707
stacks_default   11079.2  11091.1  11101.2   107224
aftrrad          10051.2  10059.6  10085.7   100173
ddocent_full        9886     9888  9886.58  95584.8
ddocent_filt        9886     9888  9884.67  90797.9

Ugly plot for locus coverage.¶

In [552]:

simlevel = ["simno", "simlo", "simhi", "simlarge"]
assemblers = ["ipyrad", "pyrad", "stacks_gapped", "stacks_ungapped",\
              "stacks_default", "aftrrad", "ddocent_full", "ddocent_filt"]
df = pd.DataFrame(index=assemblers, columns=simlevel)
df2 = pd.DataFrame(columns = ["assembler", "simtype"] + [x for x in xrange(1,13)])
df3 = pd.DataFrame(columns = ["assembler", "simtype", "simdata"])
## Try to get a dataframe in the shape seaborn will be happy with
dfsns = pd.DataFrame(columns = ["assembler", "simtype", "bin", "simdata"])

for sim in simlevel:
    for ass in assemblers:
        simstring = ass + "-" + sim
        df[sim][ass] = np.array(sim_full_loc_cov[simstring])
        df2.loc[simstring] = [ass, sim] + sim_full_loc_cov[simstring]
        df3.loc[simstring] = [ass, sim, np.array(sim_full_loc_cov[simstring])]

        for i, val in enumerate(sim_full_loc_cov[simstring]):
            ## Normalize values so different sim sizes print the same
            max = 10000.
            if "large" in simstring:
                max = 100000.
            dfsns.loc[simstring + "-" + str(i)] = [ass, sim, i+1, val/max]

sns.set(style="white")

## This kinda sucks, too spread out.
#g = sns.FacetGrid(dfsns, row="assembler", col="simtype", margin_titles=True)
#g.map(sns.barplot, "bin", "simdata", color="steelblue", lw=0)
plt.rcParams['figure.figsize']=(20,20)
g = sns.FacetGrid(dfsns, col="simtype", margin_titles=True, col_wrap=2, size=4, aspect=2)
g.map(sns.barplot, "bin", "simdata", "assembler", lw=0.5, palette="bright").add_legend()

Out[552]:

<seaborn.axisgrid.FacetGrid at 0x2aab7b71a7d0>

Another ugly plot for locus coverage.¶

In [553]:

## Barplots of assembly method x simdata type
g = sns.FacetGrid(dfsns, col="assembler", margin_titles=True, col_wrap=4, size=3, aspect=1.5)
g.map(sns.barplot, "bin", "simdata", "simtype", lw=0.5, palette="bright").add_legend()

Out[553]:

<seaborn.axisgrid.FacetGrid at 0x2aab7b85fcd0>

Much better! Do the spline plot version of the above plot¶

It just looks nicer, makes more sense.

In [598]:

dfsns2 = pd.DataFrame(columns = ["assembler", "simtype", "bin", "simdata"])


for sim in simlevel:
    for ass in assemblers:
        simstring = ass + "-" + sim
        ## Normalize values so different sim sizes print the same
        max = 10000.
        if "large" in simstring:
            max = 100000.
        
        newdat = sim_full_loc_cov[simstring]
        newdat = [sum(newdat)-sum(newdat[:i-1]) for i in range(1,13)]
        for i, val in enumerate(newdat):
            dfsns2.loc[simstring + "-" + str(i)] = [ass, sim, i+1, val]

g = sns.FacetGrid(dfsns2, col="simtype", hue="assembler", sharex=True, sharey=False, size=4, col_wrap=2)
g.map(plt.scatter, "bin", "simdata")
g.map(plt.plot, "bin", "simdata").add_legend()
axs = g.axes
for ax in axs:
    ax.set_xlim(0,12.5)

Empirical Results (Pedicularis)¶

Process the vcf output from all the runs¶

Here we'll pull together all the output vcf files and filter out everything except biallelic snps. This takes a few minutes to run.

In [603]:

vcf_dict = {}
vcf_dict["ipyrad"] = os.path.join(IPYRAD_DIR, "REALDATA/REALDATA_outfiles/REALDATA.vcf")
vcf_dict["pyrad"] = os.path.join(PYRAD_EMPIRICAL_OUTPUT, "outfiles/c85d6m2p3H3N3.vcf")
vcf_dict["stacks_ungapped"] = os.path.join(STACKS_DIR, "REALDATA/ungapped/batch_1.vcf")
vcf_dict["stacks_gapped"] = os.path.join(STACKS_DIR, "REALDATA/gapped/batch_1.vcf")
vcf_dict["stacks_default"] = os.path.join(STACKS_DIR, "REALDATA/default/batch_1.vcf")
vcf_dict["aftrrad"] = os.path.join(AFTRRAD_DIR, "REALDATA/Formatting/REALDATA.vcf")
vcf_dict["ddocent_full"] = os.path.join(DDOCENT_DIR, "REALDATA/TotalRawSNPs.vcf")
vcf_dict["ddocent_filt"] = os.path.join(DDOCENT_DIR, "REALDATA/Final.recode.vcf")
for k, f in vcf_dict.items():
    if os.path.exists(f):
        print("found - {}".format(f))
        
        ## If it's gzipped then unzip it (only applies to ipyrad i think)
        if ".gz" in f:
            print("gunzipping")
            cmd = "gunzip {}".format(f)
            !cmd
            vcf_dict[k] = f.split(".gz")[0]
            
        ## Remove all but biallelic (for ipyrad this also removes all the monomorphic)
        #outfile = f.split(".vcf")[0]+"-biallelic"
        #cmd = "{}vcftools --vcf {} --min-alleles 2 --max-alleles 2 --recode --out {}" \
        #    .format(DDOCENT_DIR, f, outfile)
        #print(cmd)
        #!$cmd
        
        ## update the vcf_dict
        #vcf_dict[k] = outfile + ".recode.vcf"
    else:
        print("not found - {}".format(f))

found - /home/iovercast/manuscript-analysis/stacks/REALDATA/default/batch_1.vcf
/home/iovercast/manuscript-analysis/dDocent/vcftools --vcf /home/iovercast/manuscript-analysis/stacks/REALDATA/default/batch_1.vcf --min-alleles 2 --max-alleles 2 --recode --out /home/iovercast/manuscript-analysis/stacks/REALDATA/default/batch_1-biallelic

VCFtools - v0.1.11
(C) Adam Auton 2009

Parameters as interpreted:
	--vcf /home/iovercast/manuscript-analysis/stacks/REALDATA/default/batch_1.vcf
	--max-alleles 2
	--min-alleles 2
	--out /home/iovercast/manuscript-analysis/stacks/REALDATA/default/batch_1-biallelic
	--recode

Reading Index file.
File contains 149570 entries and 13 individuals.
Applying Required Filters.
Filtering sites by number of alleles
After filtering, kept 13 out of 13 Individuals
After filtering, kept 149570 out of a possible 149570 Sites
Outputting VCF file... Done
Run Time = 5.00 seconds
found - /home/iovercast/manuscript-analysis/dDocent/REALDATA/TotalRawSNPs.vcf
/home/iovercast/manuscript-analysis/dDocent/vcftools --vcf /home/iovercast/manuscript-analysis/dDocent/REALDATA/TotalRawSNPs.vcf --min-alleles 2 --max-alleles 2 --recode --out /home/iovercast/manuscript-analysis/dDocent/REALDATA/TotalRawSNPs-biallelic

VCFtools - v0.1.11
(C) Adam Auton 2009

Parameters as interpreted:
	--vcf /home/iovercast/manuscript-analysis/dDocent/REALDATA/TotalRawSNPs.vcf
	--max-alleles 2
	--min-alleles 2
	--out /home/iovercast/manuscript-analysis/dDocent/REALDATA/TotalRawSNPs-biallelic
	--recode

Reading Index file.
File contains 238142 entries and 13 individuals.
Applying Required Filters.
Filtering sites by number of alleles
After filtering, kept 13 out of 13 Individuals
After filtering, kept 190405 out of a possible 238142 Sites
Outputting VCF file... Done
Run Time = 15.00 seconds
found - /home/iovercast/manuscript-analysis/pyrad/REALDATA/outfiles/c85d6m2p3H3N3.vcf
/home/iovercast/manuscript-analysis/dDocent/vcftools --vcf /home/iovercast/manuscript-analysis/pyrad/REALDATA/outfiles/c85d6m2p3H3N3.vcf --min-alleles 2 --max-alleles 2 --recode --out /home/iovercast/manuscript-analysis/pyrad/REALDATA/outfiles/c85d6m2p3H3N3-biallelic

VCFtools - v0.1.11
(C) Adam Auton 2009

Parameters as interpreted:
	--vcf /home/iovercast/manuscript-analysis/pyrad/REALDATA/outfiles/c85d6m2p3H3N3.vcf
	--max-alleles 2
	--min-alleles 2
	--out /home/iovercast/manuscript-analysis/pyrad/REALDATA/outfiles/c85d6m2p3H3N3-biallelic
	--recode

Reading Index file.
File contains 319936 entries and 13 individuals.
Applying Required Filters.
Filtering sites by number of alleles
After filtering, kept 13 out of 13 Individuals
After filtering, kept 308247 out of a possible 319936 Sites
Outputting VCF file... Done
Run Time = 5.00 seconds
not found - /home/iovercast/manuscript-analysis/aftrRAD/REALDATA/Formatting/REALDATA.vcf
found - /home/iovercast/manuscript-analysis/ipyrad/REALDATA/REALDATA_outfiles/REALDATA.vcf
/home/iovercast/manuscript-analysis/dDocent/vcftools --vcf /home/iovercast/manuscript-analysis/ipyrad/REALDATA/REALDATA_outfiles/REALDATA.vcf --min-alleles 2 --max-alleles 2 --recode --out /home/iovercast/manuscript-analysis/ipyrad/REALDATA/REALDATA_outfiles/REALDATA-biallelic

VCFtools - v0.1.11
(C) Adam Auton 2009

Parameters as interpreted:
	--vcf /home/iovercast/manuscript-analysis/ipyrad/REALDATA/REALDATA_outfiles/REALDATA.vcf
	--max-alleles 2
	--min-alleles 2
	--out /home/iovercast/manuscript-analysis/ipyrad/REALDATA/REALDATA_outfiles/REALDATA-biallelic
	--recode

Reading Index file.
File contains 4554084 entries and 13 individuals.
Applying Required Filters.
Filtering sites by number of alleles
After filtering, kept 13 out of 13 Individuals
After filtering, kept 220118 out of a possible 4554084 Sites
Outputting VCF file... Done
Run Time = 17.00 seconds
not found - /home/iovercast/manuscript-analysis/stacks/REALDATA/gapped/batch_1.vcf
found - /home/iovercast/manuscript-analysis/dDocent/REALDATA/Final.recode.vcf
/home/iovercast/manuscript-analysis/dDocent/vcftools --vcf /home/iovercast/manuscript-analysis/dDocent/REALDATA/Final.recode.vcf --min-alleles 2 --max-alleles 2 --recode --out /home/iovercast/manuscript-analysis/dDocent/REALDATA/Final.recode-biallelic

VCFtools - v0.1.11
(C) Adam Auton 2009

Parameters as interpreted:
	--vcf /home/iovercast/manuscript-analysis/dDocent/REALDATA/Final.recode.vcf
	--max-alleles 2
	--min-alleles 2
	--out /home/iovercast/manuscript-analysis/dDocent/REALDATA/Final.recode-biallelic
	--recode

Reading Index file.
File contains 99971 entries and 13 individuals.
Applying Required Filters.
Filtering sites by number of alleles
After filtering, kept 13 out of 13 Individuals
After filtering, kept 80040 out of a possible 99971 Sites
Outputting VCF file... Done
Run Time = 6.00 seconds
found - /home/iovercast/manuscript-analysis/stacks/REALDATA/ungapped/batch_1.vcf
/home/iovercast/manuscript-analysis/dDocent/vcftools --vcf /home/iovercast/manuscript-analysis/stacks/REALDATA/ungapped/batch_1.vcf --min-alleles 2 --max-alleles 2 --recode --out /home/iovercast/manuscript-analysis/stacks/REALDATA/ungapped/batch_1-biallelic

VCFtools - v0.1.11
(C) Adam Auton 2009

Parameters as interpreted:
	--vcf /home/iovercast/manuscript-analysis/stacks/REALDATA/ungapped/batch_1.vcf
	--max-alleles 2
	--min-alleles 2
	--out /home/iovercast/manuscript-analysis/stacks/REALDATA/ungapped/batch_1-biallelic
	--recode

Reading Index file.
Building new index file.
	Scanning Chromosome: un
Writing Index file.
File contains 344633 entries and 13 individuals.
Applying Required Filters.
Filtering sites by number of alleles
After filtering, kept 13 out of 13 Individuals
After filtering, kept 344633 out of a possible 344633 Sites
Outputting VCF file... Done
Run Time = 11.00 seconds

Read in vcf for each analysis and pull in coverage/depth stats¶

In [604]:

emp_loc_cov = collections.OrderedDict()
emp_snp_cov = collections.OrderedDict()
emp_sample_nsnps = collections.OrderedDict()
emp_sample_nlocs = collections.OrderedDict()
## Try just doing them all the same
for prog, filename in vcf_dict.items():
    try:
        print("Doing - {}".format(prog))
        print("\t{}".format(filename))
        v = vcfnp.variants(filename, dtypes={"CHROM":"a24"}).view(np.recarray)
        c = vcfnp.calldata_2d(filename).view(np.recarray)

        emp_loc_cov[prog] = loci_coverage(v, c, prog)
        emp_snp_cov[prog] = snp_coverage(c)
        emp_sample_nsnps[prog] = sample_nsnps(c)
        emp_sample_nlocs[prog] = sample_nloci(v, c, prog)
        
        plotPCA(c, prog)
        plotPairwiseDistance(c, prog)
    except Exception as inst:
        print(inst)

[vcfnp] 2016-10-18 11:08:57.760390 :: caching is disabled
[vcfnp] 2016-10-18 11:08:57.760901 :: building array

Doing - stacks_default
	/home/iovercast/manuscript-analysis/stacks/REALDATA/default/batch_1-biallelic.recode.vcf

[vcfnp] 2016-10-18 11:08:59.403870 :: caching is disabled
[vcfnp] 2016-10-18 11:08:59.404332 :: building array
[vcfnp] 2016-10-18 11:09:19.246532 :: caching is disabled
[vcfnp] 2016-10-18 11:09:19.247164 :: building array

Doing - ddocent_full
	/home/iovercast/manuscript-analysis/dDocent/REALDATA/TotalRawSNPs-biallelic.recode.vcf

[vcfnp] 2016-10-18 11:09:25.020836 :: caching is disabled
[vcfnp] 2016-10-18 11:09:25.021373 :: building array
[vcfnp] 2016-10-18 11:10:08.979173 :: caching is disabled
[vcfnp] 2016-10-18 11:10:08.979877 :: building array

Doing - pyrad
	/home/iovercast/manuscript-analysis/pyrad/REALDATA/outfiles/c85d6m2p3H3N3-biallelic.recode.vcf

[vcfnp] 2016-10-18 11:10:12.003180 :: caching is disabled
[vcfnp] 2016-10-18 11:10:12.003658 :: building array
[vcfnp] 2016-10-18 11:10:36.855671 :: caching is disabled
[vcfnp] 2016-10-18 11:10:36.856239 :: building array

Doing - aftrrad
	/home/iovercast/manuscript-analysis/aftrRAD/REALDATA/Formatting/REALDATA.vcf
file not found: /home/iovercast/manuscript-analysis/aftrRAD/REALDATA/Formatting/REALDATA.vcf
Doing - ipyrad
	/home/iovercast/manuscript-analysis/ipyrad/REALDATA/REALDATA_outfiles/REALDATA-biallelic.recode.vcf

[vcfnp] 2016-10-18 11:10:39.041659 :: caching is disabled
[vcfnp] 2016-10-18 11:10:39.042141 :: building array
[vcfnp] 2016-10-18 11:10:57.915668 :: caching is disabled
[vcfnp] 2016-10-18 11:10:57.916290 :: building array

Doing - stacks_gapped
	/home/iovercast/manuscript-analysis/stacks/REALDATA/gapped/batch_1.vcf
file not found: /home/iovercast/manuscript-analysis/stacks/REALDATA/gapped/batch_1.vcf
Doing - ddocent_filt
	/home/iovercast/manuscript-analysis/dDocent/REALDATA/Final.recode-biallelic.recode.vcf

[vcfnp] 2016-10-18 11:11:00.415481 :: caching is disabled
[vcfnp] 2016-10-18 11:11:00.416015 :: building array
[vcfnp] 2016-10-18 11:11:19.294158 :: caching is disabled
[vcfnp] 2016-10-18 11:11:19.294788 :: building array

Doing - stacks_ungapped
	/home/iovercast/manuscript-analysis/stacks/REALDATA/ungapped/batch_1-biallelic.recode.vcf

[vcfnp] 2016-10-18 11:11:23.090947 :: caching is disabled
[vcfnp] 2016-10-18 11:11:23.091441 :: building array

In [608]:

for i in emp_sample_nlocs:
    print(i),
    print(emp_sample_nlocs[i])

stacks_default [22215, 32074, 25377, 16640, 23557, 17426, 32172, 34354, 31333, 29809, 34792, 24791, 25256]
ddocent_full [34816, 37668, 35523, 25583, 36635, 25751, 38051, 39179, 38946, 38744, 39209, 37612, 37240]
pyrad [22826, 34866, 30179, 19710, 19076, 21977, 36830, 36774, 37212, 29720, 38012, 35350, 31155]
ipyrad [18710, 28396, 24635, 15667, 15301, 17830, 29885, 29492, 30213, 24043, 30671, 29076, 25580]
ddocent_filt [18640, 18904, 18745, 18471, 18713, 18553, 18969, 19050, 19096, 19068, 19086, 19182, 19164]
stacks_ungapped [23103, 35494, 30792, 19377, 19547, 21469, 36663, 36949, 36703, 29789, 37968, 33544, 31931]

ipyrad Empirical results¶

First we load in the variant info and call data for all the snps

In [70]:

IPYRAD_EMPIRICAL_OUTPUT=os.path.join(IPYRAD_DIR, "REALDATA/")
IPYRAD_STATS = os.path.join(IPYRAD_EMPIRICAL_OUTPUT, "REALDATA_outfiles/REALDATA_stats.txt")

infile = open(IPYRAD_STATS).readlines()
sample_coverage = [int(x.strip().split()[1]) for x in infile[20:33]]
print(sample_coverage)
print("mean sample coverage - {}".format(np.mean(sample_coverage)))
print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))

[21163, 31863, 27902, 20852, 17779, 23008, 33289, 33697, 35765, 29184, 35096, 38144, 34746]
mean sample coverage - 29422.1538462
min/max - 17779/38144

Read the biallelic vcf

In [71]:

filename = os.path.join(IPYRAD_EMPIRICAL_OUTPUT, "REALDATA_outfiles/REALDATA.biallelic.vcf")
# filename = vcf_dict["ipyrad"]
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)

[vcfnp] 2016-10-12 12:42:26.099077 :: caching is disabled
[vcfnp] 2016-10-12 12:42:26.099836 :: building array
[vcfnp] 2016-10-12 12:42:29.441796 :: caching is disabled
[vcfnp] 2016-10-12 12:42:29.442748 :: building array

Distribution of snps along loci¶

Getting variable sites and parsimony informative sites from the vcf is kind of annoying because all the programs export slightly different formats, so you need to parse them in slightly different ways. There's a better way to do this for ipyrad but i figure i'll do it the same way for all of them so it's more clear what's happening.

In [72]:

## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["POS"]), mask=only_pis).compressed()

## Now have to massage this into the list of counts per site in increasing order 
## of position across the locus
distpis = Counter([int(x) for x in pis])
#distpis = [x for x in sorted(counts.items())]

## Getting the distvar is easier
distvar = Counter([int(x) for x in v.POS])
#distvar = [x for x in sorted(counts.items())]

canvas, axes = SNP_position_plot(distvar, distpis)

## save fig
#toyplot.html.render(canvas, 'snp_positions.html')

canvas

Out[72]:

In [113]:

## Have to gunzip the ipyrad REALDATA vcf
plotPCA(c, "ipyrad")

In [114]:

plotPairwiseDistance(c, "pyrad")

pyRAD empirical results¶

TODO: Figure out why vcfnp is failing to read in pyrad vcf¶

In [206]:

PYRAD_EMPIRICAL_OUTPUT=os.path.join(PYRAD_DIR, "REALDATA/")
PYRAD_STATS = os.path.join(PYRAD_EMPIRICAL_OUTPUT, "stats/c85d6m2p3H3N3.stats")

infile = open(PYRAD_STATS).readlines()
sample_coverage = [int(x.strip().split()[1]) for x in infile[8:21]]
print(sample_coverage)
print("mean sample coverage - {}".format(np.mean(sample_coverage)))
print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))

[23924, 37002, 31852, 23698, 20063, 26050, 38931, 39423, 41250, 33230, 40809, 42948, 38645]
mean sample coverage - 33678.8461538
min/max - 20063/42948

Pull in the pyrad vcf¶

Be careful because sometimes the pyrad vcf gets a newline inserted between the last line of metadata and the first line of genotypes, which causes vcfnp to silently fail. Also there are a couple flags you can pass to vcfnp for debugging, but the print hella data to the console.

v = vcfnp.variants(filename, progress=1, verbose=True).view(np.recarray)

In [230]:

filename = os.path.join(PYRAD_EMPIRICAL_OUTPUT, "outfiles/c85d6m2p3H3N3.vcf")
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)

[vcfnp] 2016-10-08 15:40:30.499958 :: caching is disabled
[vcfnp] 2016-10-08 15:40:30.500883 :: building array
[vcfnp] 2016-10-08 15:40:37.399061 :: caching is disabled
[vcfnp] 2016-10-08 15:40:37.417059 :: building array

Distribution of snps along loci¶

In [231]:

## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["POS"]), mask=only_pis).compressed()

## Now have to massage this into the list of counts per site in increasing order 
## of position across the locus
distpis = Counter([int(x) for x in pis])
#distpis = [x for x in sorted(counts.items())]

## Getting the distvar is easier
distvar = Counter([int(x) for x in v.POS])
#distvar = [x for x in sorted(counts.items())]

canvas, axes = SNP_position_plot(distvar, distpis)

## save fig
#toyplot.html.render(canvas, 'snp_positions.html')

canvas

Out[231]:

stacks empirical results (ungapped)¶

In [12]:

STACKS_OUTPUT=os.path.join(STACKS_DIR, "REALDATA/")
STACKS_GAP_OUT=os.path.join(STACKS_OUTPUT, "gapped/")
STACKS_UNGAP_OUT=os.path.join(STACKS_OUTPUT, "ungapped/")
STACKS_DEFAULT_OUT=os.path.join(STACKS_OUTPUT, "default/")

#lines = open("SIMsmall/stackf_high/batch_1.haplotypes.tsv").readlines()
#cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]]
#shigh = [cnts.count(i) for i in range(1,13)]

In [13]:

filename = os.path.join(STACKS_UNGAP_OUT, "batch_1.vcf")
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)

[vcfnp] 2016-10-12 10:48:17.512074 :: caching is disabled
[vcfnp] 2016-10-12 10:48:17.512879 :: building array
[vcfnp] 2016-10-12 10:48:25.055213 :: caching is disabled
[vcfnp] 2016-10-12 10:48:25.061273 :: building array

Distribution of snps along loci¶

In [14]:

## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["ID"]), mask=only_pis).compressed()

## Now have to massage this into the list of counts per site in increasing order 
## of position across the locus
distpis = Counter([int(x.split("_")[1]) for x in pis])
#distpis = [x for x in sorted(counts.items())]

## Getting the distvar is easier
distvar = Counter([int(x.split("_")[1]) for x in v.ID])
#distvar = [x for x in sorted(counts.items())]

canvas, axes = SNP_position_plot(distvar, distpis)

## save fig
#toyplot.html.render(canvas, 'snp_positions.html')

canvas

Out[14]:

In [15]:

plotPCA(c, "stacks ungapped")

In [16]:

plotPairwiseDistance(c, "stacks ungapped")

[vcfnp] 2016-09-03 19:04:05.829749 :: caching is disabled
[vcfnp] 2016-09-03 19:04:05.830278 :: building array

stacks empirical results (defaults)¶

In [16]:

filename = os.path.join(STACKS_DEFAULT_OUT, "batch_1.vcf")
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)

[vcfnp] 2016-10-12 10:53:17.257960 :: caching is disabled
[vcfnp] 2016-10-12 10:53:17.259180 :: building array
[vcfnp] 2016-10-12 10:53:20.403885 :: caching is disabled
[vcfnp] 2016-10-12 10:53:20.409034 :: building array

Some informative stats¶

TODO

Distribution of snps along loci¶

In [17]:

## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["ID"]), mask=only_pis).compressed()

## Now have to massage this into the list of counts per site in increasing order 
## of position across the locus
distpis = Counter([int(x.split("_")[1]) for x in pis])
#distpis = [x for x in sorted(counts.items())]

## Getting the distvar is easier
distvar = Counter([int(x.split("_")[1]) for x in v.ID])
#distvar = [x for x in sorted(counts.items())]

canvas, axes = SNP_position_plot(distvar, distpis)

## save fig
toyplot.html.render(canvas, 'snp_positions.html')

canvas

Out[17]:

In [18]:

plotPCA(c, "stacks default")

In [12]:

plotPairwiseDistance(c, "stacks default")

[vcfnp] 2016-09-03 18:56:31.855138 :: caching is disabled
[vcfnp] 2016-09-03 18:56:31.855756 :: building array

Out[12]:

<matplotlib.axes._subplots.AxesSubplot at 0x2aab15d09e90>

AftrRAD empirical results¶

In [ ]:

AFTRRAD_OUTPUT=os.path.join(AFTRRAD_DIR, "REALDATA/")

In [ ]:

filename = os.path.join(STACKS_DEFAULT_OUT, "batch_1.vcf")
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)

Distribution of snps along loci¶

NOT TESTED

It's not very straightforward how to extract information about whether each snp is informative or not, given the output files that aftrRAD generates. It does create a file with all the snp locations so we can use this to generate the distribution of variable sites, but the pis will be empty.

In [184]:

f = os.path.join(AFTRRAD_OUTPUT, "TempFiles/SNPLocationsToPlot.txt")
with open(f) as infile:
    snplocs = infile.read().split()
## Now have to massage this into the list of counts per site in increasing order 
## of position across the locus
distpis = Counter([])

## Getting the distvar is easier
distvar = Counter(map(int, snplocs))

canvas, axes = SNP_position_plot(distvar, distpis)

## save fig
## toyplot.html.render(canvas, 'snp_positions.html')

canvas

Out[184]:

dDocent empirical results¶

HAVE TO MAKE SURE THE SAMPLES IN THE DDOCENT OUTPUT VCF ARE IN THE SAME ORDER AS THOSE IN THE IPYRAD/STACKS There is now code in the horserace notebook to rename samples in the ddocent vcf and to reorder the samples to match ipyrad/stacks.

In [ ]:

import subprocess

DDOCENT_OUTPUT=os.path.join(DDOCENT_DIR, "REALDATA/")
os.chdir(DDOCENT_OUTPUT)
print("Getting max loci per sample from bam files.")
sample_coverage = []
for samp in glob.glob("*-RG.bam"):
    cmd = "{}samtools view {} | cut -f 3 | uniq | wc -l".format(DDOCENT_DIR, samp)
    res = subprocess.check_output(cmd, shell=True)
    print("nloci {} - {}".format(res.strip(), samp))
    sample_coverage.append(int(res.strip()))
    
print(sorted(sample_coverage, reverse=True))
    
print("mean sample coverage - {}".format(np.mean(sample_coverage)))
print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))
#sim_coverage_df["ddocent_"+sim] = sample_coverage
    
vcf_filt = pd.read_csv("Final.recode.vcf", delim_whitespace=True, header=60, index_col=0)
vcf_full = pd.read_csv("TotalRawSNPs.vcf", delim_whitespace=True, header=60, index_col=0)

## There's gotta be a faster way to do this...
## It's really fuckin slow on real data, like hours.
for vcf in [vcf_full, vcf_filt]:
    sample_counts = {}
    sample_names = list(vcf.columns)[8:]
    print(sample_names)
    print("num loci in vcf - {}".format(len(set(vcf.index))))
    for name in sample_names:
        print(name)
        sample_counts[name] = 0
        for locus in set(vcf.index):
            snps = vcf[name][locus]
            if any(map(lambda x: x.split(":")[0] != "./.", snps)):
                sample_counts[name] += 1
            else:
                pass
    print(sample_counts)
print(sim_coverage_df)

Getting max loci per sample from bam files.
nloci 37358 - Pop1_Sample1-RG.bam
nloci 41001 - Pop3_Sample1-RG.bam
nloci 29663 - Pop2_Sample2-RG.bam
nloci 42706 - Pop3_Sample2-RG.bam
nloci 29490 - Pop2_Sample1-RG.bam
nloci 38202 - Pop4_Sample2-RG.bam
nloci 42543 - Pop3_Sample3-RG.bam
nloci 42667 - Pop3_Sample4-RG.bam
nloci 42999 - Pop3_Sample5-RG.bam
nloci 43950 - Pop4_Sample1-RG.bam
nloci 44300 - Pop4_Sample3-RG.bam
nloci 40740 - Pop5_Sample1-RG.bam
nloci 39368 - Pop5_Sample2-RG.bam
[44300, 43950, 42999, 42706, 42667, 42543, 41001, 40740, 39368, 38202, 37358, 29663, 29490]
mean sample coverage - 39614.3846154
min/max - 29490/44300

In [171]:

## Basic difference btw Final.recode and TotalRawSNPs is that final recode
## only includes individuals with snps called in > 90% of samples

f1 = os.path.join(DDOCENT_OUTPUT, "TotalRawSNPs.vcf")
v_full = vcfnp.variants(f1, dtypes={"CHROM":"a24"}).view(np.recarray)
c_full = vcfnp.calldata_2d(f1).view(np.recarray)
f2 = os.path.join(DDOCENT_OUTPUT, "Final.recode.vcf")
v_filt = vcfnp.variants(f2, dtypes={"CHROM":"a24"}).view(np.recarray)
c_filt = vcfnp.calldata_2d(f2).view(np.recarray)

[vcfnp] 2016-10-15 15:13:53.339632 :: caching is disabled
[vcfnp] 2016-10-15 15:13:53.340435 :: building array
[vcfnp] 2016-10-15 15:14:20.818988 :: caching is disabled
[vcfnp] 2016-10-15 15:14:20.832036 :: building array
[vcfnp] 2016-10-15 15:15:48.271026 :: caching is disabled
[vcfnp] 2016-10-15 15:15:48.272360 :: building array
[vcfnp] 2016-10-15 15:15:59.214829 :: caching is disabled
[vcfnp] 2016-10-15 15:15:59.224579 :: building array

Distribution of snps along loci¶

In [75]:

for dset in [[v_full, c_full], [v_filt, c_filt]]:
    ## Get only parsimony informative sites
    ## Get T/F values for whether each genotype is ref or alt across all samples/loci
    is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), dset[1]["genotype"])
    ## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
    alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
    ## Create a T/F mask for snps that are informative
    only_pis = map(lambda x: x < 2, alt_counts)
    ## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
    ## Also, compress() the masked array so we only actually see the pis
    pis = np.ma.array(np.array(dset[0]["POS"]), mask=only_pis).compressed()

    ## Now have to massage this into the list of counts per site in increasing order 
    ## of position across the locus
    distpis = Counter([int(x) for x in pis])

    ## Getting the distvar is easier
    distvar = Counter([int(x) for x in v_filt.POS])

    canvas, axes = SNP_position_plot(distvar, distpis)

    ## save fig
    ## toyplot.html.render(canvas, 'snp_positions.html')

    canvas

In [82]:

#Tmp delete me
print(np.sum(distvar.values()))
print(np.sum(distpis.values()))

99971
60996

In [238]:

for dset in [[v_full, c_full], [v_filt, c_filt]]:
    plotPCA(dset[1], "ddocent")

In [239]:

plotPairwiseDistance(c_full, "dDocent Full")
plotPairwiseDistance(c_filt, "dDocent Filtered")

Everything Below here is crap!¶

In [584]:

## Testing for different vcf formats
prog = "stacks-simhi"
filename = "/home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simhi/batch_1.vcf"
#prog = "dDocent-simhi"
#filename = "/home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/TotalRawSNPs.vcf"
#prog = "ipyrad-simlarge"
#filename = "/home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simlarge/simlarge_outfiles/simlarge-biallelic.recode.vcf"
prog = "pyrad-simhi"
filename = "/home/iovercast/manuscript-analysis/pyrad/SIMDATA/simhi/outfiles/c85d6m2p3H3N3.vcf"
v = vcfnp.variants(filename, dtypes={"CHROM":"a24"}).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)

print("loc_cov", loci_coverage(v, c, prog))
print("snp_cov", snp_coverage(c))
print("snps", sample_nsnps(c))
print("nlocs", sample_nloci(v, c, prog))

[vcfnp] 2016-10-17 14:32:50.401685 :: caching is disabled
[vcfnp] 2016-10-17 14:32:50.402203 :: building array
[vcfnp] 2016-10-17 14:32:50.880046 :: caching is disabled
[vcfnp] 2016-10-17 14:32:50.880506 :: building array

('loc_cov', [0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 9881])
('snp_cov', [1, 10, 6, 6, 1, 14, 0, 10, 7, 6, 61, 43054])
('snps', [43139, 43139, 43135, 43141, 43134, 43139, 43139, 43139, 43144, 43141, 43143, 43141])
('nlocs', [9883, 9883, 9883, 9883, 9883, 9882, 9883, 9883, 9883, 9883, 9883, 9883])

Everything Below here is crap!¶

In [49]:

## Maybe useful
print(v.ID)
print(len(v.ID))
print(len(set([x.split("_")[0] for x in v.ID])))

['1_6' '1_14' '1_27' ..., '614361_45' '614386_71' '614406_58']
344633
76984

In [37]:

import itertools
import numpy.ma as ma
print(v.ID)
print(len(v.ID))
loc_list = []
loci = set([x.split("_")[0] for x in v.ID])
print(len(loci))
loc_list = [list(val) for key,val in itertools.groupby(v.ID,key=lambda x:x.split("_")[0])]

print(len(loc_list))
## experimentation
print(v.dtype)
print(c.dtype)

print(np.mean(c.DP))
print(loc_list[0:10])
subsamp = np.array([np.random.choice(x, 1, replace=False)[0] for x in loc_list])
mask = np.in1d(v.ID, subsamp)

['4_35' '4_62' '5_56' ..., '496594_58' '496602_66' '496604_59']
149570
82955
82955
(numpy.record, [('CHROM', 'S12'), ('POS', '<i4'), ('ID', 'S12'), ('REF', 'S12'), ('ALT', 'S12'), ('QUAL', '<f4'), ('FILTER', [('PASS', '?')]), ('num_alleles', 'u1'), ('is_snp', '?'), ('svlen', '<i4'), ('AF', '<f2'), ('NS', '<i4')])
(numpy.record, [('is_called', '?'), ('is_phased', '?'), ('genotype', 'i1', (2,)), ('AD', '<u2', (2,)), ('DP', '<u2'), ('GL', '<f4'), ('GT', 'S3')])
5.95418198837
[['4_35', '4_62'], ['5_56', '5_62', '5_68'], ['6_61', '6_68'], ['7_67'], ['8_5', '8_41', '8_65'], ['11_40'], ['13_49', '13_71'], ['15_6', '15_10', '15_24', '15_34'], ['16_26'], ['17_17', '17_26', '17_27', '17_66']]

In [55]:

g = allel.GenotypeArray(cmask.genotype)
gn = g.to_n_alt()
m = allel.stats.rogers_huff_r(gn[:1000]) ** 2
ax = allel.plot.pairwise_ld(m)

In [ ]:

## Some crap from when i was counting loci a very stupid and slow way for stacks.
        ### This doesn't' count monomorphics, but could be useful still. It's dog slow.
            #vcf = pd.read_csv("{}/batch_1.vcf".format(simdir), delim_whitespace=True, header=9, index_col=2)
            #sample_counts = {}
            #sample_names = list(vcf.columns)[8:]
            #print(sample_names)
            #uniq_loci = set([x.split("_")[0] for x in vcf.index])
            #print("num loci - {}".format(len(uniq_loci)))
            ## Replace the ID index in the stacks vcf file so we can actually evaluate loci
            #vcf.index = pd.Index([x.split("_")[0] for x in vcf.index])
            #for name in sample_names:
            #    print(name),
            #    sample_counts[name] = 0
            #    for locus in uniq_loci:
            #        snps = vcf[name][locus]
            #        if any(map(lambda x: x.split(":")[0] != "./.", snps)):
            #            sample_counts[name] += 1
            #        else:
            #            pass
            #        #print("{} {} {}".format(name, locus, snps))
            #print(sample_counts)
            #sim_full_sample_nlocs["stacks_"+stacks_method+"-"+sim] = sample_coverage
 

In [191]:

filename = os.path.join(DDOCENT_OUTPUT, "Final.recode.snps.vcf")
v_filt = vcfnp.variants(filename).view(np.recarray)
c_filt = vcfnp.calldata_2d(filename).view(np.recarray)

[vcfnp] 2016-10-06 21:43:30.965523 :: caching is disabled
[vcfnp] 2016-10-06 21:43:30.966488 :: building array
[vcfnp] 2016-10-06 21:43:42.021230 :: caching is disabled
[vcfnp] 2016-10-06 21:43:42.029759 :: building array

In [220]:

from collections import Counter
print(v_filt.dtype)
#wat = Counter(vcall["POS"])
#wat
## DDOCENT
v_filt["is_snp"]
print(v_filt[0:2])
print(v_filt["NS"][:40])
print(v_filt["NUMALT"][:40])
print(v_filt["ALT"][:40])
loci = set(v_filt["CHROM"])

(numpy.record, [('CHROM', 'S24'), ('POS', '<i4'), ('ID', 'S12'), ('REF', 'S12'), ('ALT', 'S12'), ('QUAL', '<f4'), ('FILTER', [('PASS', '?')]), ('num_alleles', 'u1'), ('is_snp', '?'), ('svlen', '<i4'), ('AB', '<f4'), ('ABP', '<f4'), ('AC', '<u2'), ('AF', '<f2'), ('AN', '<u2'), ('AO', '<i4'), ('CIGAR', 'S12'), ('DP', '<i4'), ('DPB', '<f4'), ('DPRA', '<f4'), ('END', '<i4'), ('EPP', '<f4'), ('EPPR', '<f4'), ('GTI', '<i4'), ('LEN', '<i4'), ('MEANALT', '<f4'), ('MIN_DP', '<i4'), ('MQM', '<f4'), ('MQMR', '<f4'), ('NS', '<i4'), ('NUMALT', '<i4'), ('ODDS', '<f4'), ('PAIRED', '<f4'), ('PAIREDR', '<f4'), ('PAO', '<f4'), ('PQA', '<f4'), ('PQR', '<f4'), ('PRO', '<f4'), ('QA', '<i4'), ('QR', '<i4'), ('RO', '<i4'), ('RPL', '<f4'), ('RPP', '<f4'), ('RPPR', '<f4'), ('RPR', '<f4'), ('RUN', '<i4'), ('SAF', '<i4'), ('SAP', '<f4'), ('SAR', '<i4'), ('SRF', '<i4'), ('SRP', '<f4'), ('SRR', '<i4'), ('TYPE', 'S12'), ('technology.Illumina', '<f4')])
[ ('dDocent_Contig_1', 7, '.', 'T', 'A', 9108.900390625, (False,), 3, True, 0, 0.5513780117034912, 12.15880012512207, 15, 0.5771484375, 26, 286, '1X', 466, 466.0, 0.0, 0, 624.051025390625, 317.8739929199219, 0, 1, 1.1538499593734741, 0, 55.307701110839844, 60.0, 13, 2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 11292, 5756, 145, 0.0, 624.051025390625, 317.8739929199219, 286.0, 1, 286, 624.051025390625, 0, 145, 317.8739929199219, 0, 'snp', 1.0)
 ('dDocent_Contig_1', 14, '.', 'CTT', 'CTA', 10709.2001953125, (False,), 3, False, 0, 0.6491230130195618, 80.07849884033203, 16, 0.615234375, 26, 325, '2M1X', 466, 466.0, 0.0, 0, 708.739013671875, 265.75799560546875, 0, 1, 1.2307699918746948, 0, 54.60309982299805, 60.0, 13, 2, 2.7755598776686065e-17, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12818, 4766, 121, 0.0, 708.739013671875, 265.75799560546875, 325.0, 1, 325, 708.739013671875, 0, 121, 265.75799560546875, 0, 'snp', 1.0)]
[13 13 13 13 13 13 13 13 13 13 12 12 13 13 13 12 12 12 12 12 13 13 13 13 13
 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13]
[2 2 2 1 2 1 1 1 1 3 1 1 2 2 9 2 1 1 1 1 1 2 1 3 1 3 1 1 1 1 2 1 1 3 2 2 1
 2 1 1]
['A' 'CTA' 'C' 'T' 'CGTG' 'A' 'C' 'T' 'A' 'AAAACAAC' 'GAC' 'T' 'A' 'AATAT'
 'TGCGTTCAACGA' 'GT' 'ACGT' 'G' 'G' 'C' 'T' 'CACTAT' 'C' 'ATGAAG' 'C'
 'AATTTATGTATA' 'C' 'T' 'T' 'G' 'CATTCTG' 'T' 'A' 'GATATATATG' 'AGAT'
 'ATCTT' 'T' 'CGAGTTATT' 'T' 'A']
19843

In [200]:

print(c.dtype)
print(len(c))
print(c["GT"][0:10][:,0])
print((c["GT"][0:10000][:,0] == "./.").sum())

print(c["GT"][0:5] != "./.")
print((x != "./.").sum() for x in c["GT"][0:5])

from collections import Counter

loci_coverage = [x.sum() for x in c_filt["GT"].T != "./."]
print(loci_coverage)

(numpy.record, [('is_called', '?'), ('is_phased', '?'), ('genotype', 'i1', (2,)), ('CATG', 'S12'), ('GT', 'S3')])
220118
['./.' './.' './.' './.' './.' './.' './.' './.' '0/0' '0/0']
5811
[[False False False False False False False  True False False False  True
  False]
 [False False False False False False False  True False False False  True
  False]
 [False False False False  True False False False  True False False False
  False]
 [False False False False  True False False False  True False False False
  False]
 [False False False False  True False False False  True False False False
  False]]
<generator object <genexpr> at 0x2aab77fca870>
[96757, 98257, 97393, 95620, 97197, 95823, 98629, 98982, 99257, 99071, 99157, 99736, 99578]

In [599]:

#x = sim_loc_cov
#x = sim_snp_cov
#x = sim_sample_nsnps
x = sim_sample_nlocs
for sim in ["-simno", "-simlo", "-simhi", "-simlarge"]:
    for prog in ["ipyrad", "pyrad", "stacks_ungapped", "stacks_gapped", "stacks_defualt", "ddocent_filt", "ddocent_full", "aftrrad"]:
        try:
            print(prog+sim+"\t"),
            print(x[prog+sim]),
            print(np.mean(x[prog+sim]))
        except:
            print("No {}".format(prog+sim))
    print("------------------------------------------------------")

ipyrad-simno	[9870, 9870, 9870, 9870, 9870, 9870, 9870, 9870, 9870, 9870, 9870, 9870] 9870.0
pyrad-simno	[9894, 9894, 9894, 9894, 9894, 9894, 9894, 9894, 9894, 9894, 9894, 9894] 9894.0
stacks_ungapped-simno	[9870, 9862, 9865, 9863, 9862, 9864, 9864, 9867, 9865, 9867, 9863, 9865] 9864.75
stacks_gapped-simno	[9870, 9862, 9865, 9863, 9862, 9864, 9864, 9867, 9865, 9867, 9863, 9865] 9864.75
stacks_defualt-simno	[9320, 9303, 9253, 9096, 9042, 9021, 8958, 8934, 8846, 8823, 8767, 8687] 9004.16666667
ddocent_filt-simno	[9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852] 9852.0
ddocent_full-simno	[9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852, 9852] 9852.0
aftrrad-simno	No aftrrad-simno
------------------------------------------------------
ipyrad-simlo	[9859, 9859, 9859, 9859, 9859, 9859, 9859, 9859, 9859, 9859, 9859, 9859] 9859.0
pyrad-simlo	[9884, 9884, 9884, 9884, 9884, 9884, 9884, 9884, 9884, 9884, 9884, 9884] 9884.0
stacks_ungapped-simlo	[9819, 9808, 9818, 9812, 9832, 9825, 9826, 9801, 9811, 9808, 9817, 9801] 9814.83333333
stacks_gapped-simlo	[1512, 1503, 1507, 1503, 1507, 1504, 1508, 1508, 1507, 1506, 1508, 1506] 1506.58333333
stacks_defualt-simlo	[9275, 9254, 9207, 9055, 9020, 8993, 8929, 8888, 8807, 8779, 8741, 8645] 8966.08333333
ddocent_filt-simlo	[9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853] 9853.0
ddocent_full-simlo	[9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853, 9853] 9853.0
aftrrad-simlo	No aftrrad-simlo
------------------------------------------------------
ipyrad-simhi	[9844, 9844, 9844, 9844, 9844, 9843, 9844, 9844, 9844, 9844, 9844, 9844] 9843.91666667
pyrad-simhi	[9877, 9877, 9877, 9877, 9877, 9876, 9877, 9877, 9877, 9877, 9877, 9877] 9876.91666667
stacks_ungapped-simhi	[9741, 9769, 9746, 9746, 9756, 9747, 9756, 9745, 9727, 9731, 9717, 9715] 9741.33333333
stacks_gapped-simhi	[60, 60, 59, 60, 60, 60, 60, 60, 60, 60, 59, 58] 59.6666666667
stacks_defualt-simhi	[9187, 9208, 9138, 8985, 8955, 8925, 8878, 8842, 8737, 8716, 8648, 8582] 8900.08333333
ddocent_filt-simhi	[9848, 9848, 9848, 9848, 9848, 9848, 9846, 9847, 9847, 9848, 9848, 9848] 9847.66666667
ddocent_full-simhi	[9848, 9848, 9848, 9850, 9850, 9850, 9848, 9848, 9849, 9850, 9850, 9850] 9849.08333333
aftrrad-simhi	No aftrrad-simhi
------------------------------------------------------
ipyrad-simlarge	[95399, 95428, 95405, 95456, 95450, 95412, 95437, 95418, 95283, 95294, 95288, 95302] 95381.0
pyrad-simlarge	[95646, 95675, 95649, 95708, 95695, 95659, 95686, 95672, 95543, 95552, 95544, 95558] 95632.25
stacks_ungapped-simlarge	[95366, 95401, 95391, 95400, 95394, 95367, 95411, 95405, 95232, 95276, 95268, 95271] 95348.5
stacks_gapped-simlarge	[95366, 95401, 95391, 95400, 95394, 95367, 95411, 95405, 95232, 95276, 95268, 95271] 95348.5
stacks_defualt-simlarge	[90013, 89989, 89284, 87596, 87548, 87334, 86818, 86236, 85427, 85225, 84641, 83680] 86982.5833333
ddocent_filt-simlarge	[90677, 90706, 90329, 89908, 90701, 90653, 90336, 89898, 90677, 90684, 90271, 89800] 90386.6666667
ddocent_full-simlarge	[95169, 95200, 95167, 95218, 95203, 95164, 95198, 95177, 95048, 95056, 95049, 95052] 95141.75
aftrrad-simlarge	No aftrrad-simlarge
------------------------------------------------------

In [338]:

if True:
    keys   = ["e", "b", "b", "c", "d", "e", "e", 'a']
    values = [1.2, 4.5, 4.3, 2.0, 5.67, 8.08, 9.01,1]

    print('two methods of splitting by group')
    print('as list')
    for k,v in zip(*npi.group_by(keys)(values)):
        print(k, v)
    print('as iterable')
    g = npi.group_by(keys)
    for k,v in zip(g.unique, g.split_sequence_as_iterable(values)):
        print(k, list(v))
    print('iterable as iterable')
    for k, v in zip(g.unique, g.split_iterable_as_iterable(values)):
        print(k, list(v))

two methods of splitting by group
as list
('a', array([ 1.]))
('b', array([ 4.5,  4.3]))
('c', array([ 2.]))
('d', array([ 5.67]))
('e', array([ 1.2 ,  8.08,  9.01]))
as iterable
[1 2 1 1 3]
('a', [1])
('b', [4.5, 4.3])
('c', [2.0])
('d', [5.67])
('e', [1.2, 8.08, 9.01])
iterable as iterable
('a', [1])
('b', [4.5, 4.3])
('c', [2.0])
('d', [5.67])
('e', [1.2, 8.08, 9.01])