#!/usr/bin/env python
# coding: utf-8

# # Results for ipyrad/pyrad/stacks/aftrRAD/dDocent simulated & empirical analyses

# In[8]:

## Prereqs for plotting and analysis
#!conda install matplotlib
#!conda install libgcc   # for vcfnp
#!pip install vcfnp
#!pip install scikit-allel
#!conda install -y seaborn
#!pip install toyplot
#!pip install numpy_indexed

# In[1]:

## Imports and working/output directories
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams["figure.figsize"] = [12,9]
from collections import Counter
import seaborn as sns
sns.set_style('white')
sns.set_style('ticks')
import toyplot
import toyplot.html     ## toyplot sublib for saving html plots
import pandas as pd
import numpy as np
import collections
import allel
import vcfnp
import shutil
import glob
import os
from allel.util import *    ## for ensure_square()

## Set the default directories for exec and data.
WORK_DIR="/home/iovercast/manuscript-analysis/"
EMPERICAL_DATA_DIR=os.path.join(WORK_DIR, "example_empirical_rad/")
IPYRAD_DIR=os.path.join(WORK_DIR, "ipyrad/")
PYRAD_DIR=os.path.join(WORK_DIR, "pyrad/")
STACKS_DIR=os.path.join(WORK_DIR, "stacks/")
AFTRRAD_DIR=os.path.join(WORK_DIR, "aftrRAD/")
DDOCENT_DIR=os.path.join(WORK_DIR, "dDocent/")

## tmp for testing
#IPYRAD_OUTPUT = "./arch/ipyrad/REALDATA/"
#PYRAD_OUTPUT = "./arch/pyrad/REALDATA/"

os.chdir(WORK_DIR)

# ## Get sample names, species names, and the species_dict
# Next we'll do some housekeeping to get our empirical samples sorted into species.

# In[2]:

## Get sample names and assign them to a species dict
IPYRAD_STATS=os.path.join(IPYRAD_DIR, "REALDATA/REALDATA_outfiles/REALDATA_stats.txt")
infile = open(IPYRAD_STATS).readlines()
sample_names = [x.strip().split()[0] for x in infile[20:33]]
species = set([x.split("_")[1] for x in sample_names])

species_dict = collections.OrderedDict([])
## Ordered dict of sample names and their species, in same order
## as the vcf file
for s in sample_names:
    species_dict[s] = s.split("_")[1]
print(species_dict)

## Map species names to groups of individuals
#for s in species:
#    species_dict[s] = [x for x in sample_names if s in x]
#print(species_dict)

species_colors = {
    'rex': '#FF0000',
    'cyathophylla': '#008000',
    'thamno': '#00FFFF',
    'cyathophylloides': '#90EE90',
    'przewalskii': '#FFA500',
    'superba': '#8B0000',
}

# ## Function for plotting PCA given an input vcf file

# In[4]:

def plotPCA(call_data, title):
    c = call_data
    g = allel.GenotypeArray(c.genotype)
    ac = g.count_alleles()

    ## Filter singletons and multi-allelic snps
    flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
    gf = g.compress(flt, axis=0)
    gn = gf.to_n_alt()
    coords1, model1 = allel.stats.pca(gn, n_components=10, scaler='patterson')

    fig = plt.figure(figsize=(5, 5))
    ax = fig.add_subplot(1, 1, 1)
    sns.despine(ax=ax, offset=5)
    x = coords1[:, 0]
    y = coords1[:, 1]

    ## We know this works because the species_dict and the columns in the vcf
    ## are in the same order.
    for sp in species:
        flt = (np.array(species_dict.values()) == sp)
        ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=species_colors[sp],
                label=sp, markersize=10, mec='k', mew=.5)

    ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
    ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))
    ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    fig.suptitle(title+" pca", y=1.02, style="italic", fontsize=20, fontweight='bold')
    fig.tight_layout()


def getPCA(call_data):
    c = call_data
    g = allel.GenotypeArray(c.genotype)
    ac = g.count_alleles()

    ## Filter singletons and multi-allelic snps
    flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
    gf = g.compress(flt, axis=0)
    gn = gf.to_n_alt()
    coords1, model1 = allel.stats.pca(gn, n_components=10, scaler='patterson')
    return coords1, model1

    """
    fig = plt.figure(figsize=(5, 5))
    x = coords1[:, 0]
    y = coords1[:, 1]

    ## We know this works because the species_dict and the columns in the vcf
    ## are in the same order.
#    for sp in species:
#        flt = (np.array(species_dict.values()) == sp)
#        ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=species_colors[sp], label=sp, markersize=10, mec='k', mew=.5)
    ax.plot(x, y, marker='o', markersize=10)
    ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
    ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))
    """


def plotPairwiseDistance(call_data, title):
    c = call_data
    #c = vcfnp.calldata_2d(filename).view(np.recarray)
    g = allel.GenotypeArray(c.genotype)
    gn = g.to_n_alt()
    dist = allel.stats.pairwise_distance(gn, metric='euclidean')
    allel.plot.pairwise_distance(dist, labels=species_dict.keys())


def getDistances(call_data):
    c = call_data
    #c = vcfnp.calldata_2d(filename).view(np.recarray)
    g = allel.GenotypeArray(c.genotype)
    gn = g.to_n_alt()
    dist = allel.stats.pairwise_distance(gn, metric='euclidean')
    return(dist)

# ## Function for plotting distribution of variable sites across loci

# In[5]:

## Inputs to this function are two Counters where keys
## are the base position and values are snp counts.
## The counter doesn't have to be sorted because we sort internally.
def SNP_position_plot(distvar, distpis):
    ## The last position to consider
    maxend = max(distvar.keys())

    ## This does two things, first it sorts in increasing
    ## order. Second, it creates a count bin for any position
    ## without snps and sets the count to 0.
    distvar = [distvar[x] for x in xrange(maxend)]
    distpis = [distpis[x] for x in xrange(maxend)]

    ## set color theme
    colormap = toyplot.color.Palette()

    ## make a canvas
    canvas = toyplot.Canvas(width=800, height=300)

    ## make axes
    axes = canvas.cartesian(xlabel="Position along RAD loci",
                            ylabel="N variable sites",
                            gutter=65)

    ## x-axis
    axes.x.ticks.show = True
    axes.x.label.style = {"baseline-shift":"-40px", "font-size":"16px"}
    axes.x.ticks.labels.style = {"baseline-shift":"-2.5px", "font-size":"12px"}
    axes.x.ticks.below = 5
    axes.x.ticks.above = 0
    axes.x.domain.max = maxend
    axes.x.ticks.locator = toyplot.locator.Explicit(
        range(0, maxend, 5),
        map(str, range(0, maxend, 5)))

    ## y-axis
    axes.y.ticks.show=True
    axes.y.label.style = {"baseline-shift":"40px", "font-size":"16px"}
    axes.y.ticks.labels.style = {"baseline-shift":"5px", "font-size":"12px"}
    axes.y.ticks.below = 0
    axes.y.ticks.above = 5

    ## add fill plots
    x = np.arange(0, maxend)
    f1 = axes.fill(x, distvar, color=colormap[0], opacity=0.5, title="total variable sites")
    f2 = axes.fill(x, distpis, color=colormap[1], opacity=0.5, title="parsimony informative sites")

    ## add a horizontal dashed line at the median Nsnps per site
    axes.hlines(np.median(distvar), opacity=0.9, style={"stroke-dasharray":"4, 4"})
    axes.hlines(np.median(distpis), opacity=0.9, style={"stroke-dasharray":"4, 4"})

    return canvas, axes

# ## Functions for pulling stats from vcf call data

# In[6]:

import numpy_indexed as npi

## Get the number of samples with data at each snp
def snp_coverage(call_data):
    snp_counts = collections.Counter([x.sum() for x in call_data["GT"] != "./."])
    ## Fill zero values
    return [snp_counts[x] for x in xrange(1, max(snp_counts.keys())+1)]

## Get the number of samples with data at each locus
def loci_coverage(var_data, call_data, assembler):
    if "stacks" in assembler:
        loci = zip(*npi.group_by(map(lambda x: x.split("_")[0], var_data["ID"]))(call_data["GT"] != "./."))
    else:
        loci = zip(*npi.group_by(var_data["CHROM"])(call_data["GT"] != "./."))

    counts_per_snp = []
    for z in xrange(0, len(loci)):
        counts_per_snp.append([x.sum() for x in loci[z][1]])
    counts = collections.Counter([np.max(x) for x in counts_per_snp])

    ## Fill all zero values
    return [counts[x] for x in xrange(1, max(counts.keys())+1)]

## Get total number of snps per sample
def sample_nsnps(call_data):
    return [x.sum() for x in call_data["GT"].T != "./."]

## Get total number of loci per sample
def sample_nloci(var_data, call_data, assembler):
    if "stacks" in assembler:
        locus_groups = npi.group_by(map(lambda x: x.split("_")[0], var_data["ID"]))(call_data["GT"] != "./.")
    else:
        locus_groups = npi.group_by(var_data["CHROM"])(call_data["GT"] != "./.")
    by_locus = [x.T for x in locus_groups[1]]
    by_sample = np.array([(x).any(axis=1) for x in by_locus])
    return [x.sum() for x in by_sample.T]

# ## End housekeeping. Begin actual analysis of results.
#
# ## First let's look at results just from the vcf files (so this is only looking at variable loci).
# The first thing we'll do is create a dataframe for storing a bunch
# of coverage information from the runs for each method.
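# To make the bookkeeping in the coverage helpers above concrete, here is a tiny sketch run on a
# made-up GT matrix (toy values only, not real call data): rows are snps, columns are samples,
# and "./." means missing.

# In[ ]:

## Toy illustration of the snp_coverage / sample_nsnps logic (made-up genotypes)
toy_gt = np.array([["0/1", "./.", "1/1"],
                   ["0/0", "0/1", "./."],
                   ["./.", "./.", "1/1"],
                   ["0/1", "0/1", "0/1"]])

## snp_coverage counts, per snp, how many samples have a called genotype
print([x.sum() for x in toy_gt != "./."])      ## -> [2, 2, 1, 3]

## sample_nsnps counts, per sample, how many snps have a called genotype
print([x.sum() for x in toy_gt.T != "./."])    ## -> [3, 2, 3]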
# In[7]:

## Make a new dict for holding the vcf file paths
## for each assembler and each simulation scenario.
sim_vcf_dict = {}
for sim in ["simno", "simlo", "simhi", "simlarge"]:
    sim_vcf_dict["ipyrad-"+sim] = os.path.join(IPYRAD_DIR, "SIMDATA/{}/{}_outfiles/{}.vcf".format(sim, sim, sim))
    sim_vcf_dict["pyrad-"+sim] = os.path.join(PYRAD_DIR, "SIMDATA/{}/outfiles/c85d6m2p3H3N3.vcf".format(sim))
    sim_vcf_dict["stacks_ungapped-"+sim] = os.path.join(STACKS_DIR, "SIMDATA/ungapped/{}/batch_1.vcf".format(sim))
    sim_vcf_dict["stacks_gapped-"+sim] = os.path.join(STACKS_DIR, "SIMDATA/gapped/{}/batch_1.vcf".format(sim))
    sim_vcf_dict["stacks_defualt-"+sim] = os.path.join(STACKS_DIR, "SIMDATA/default/{}/batch_1.vcf".format(sim))
    sim_vcf_dict["aftrrad-"+sim] = os.path.join(AFTRRAD_DIR, "SIMDATA/{}/Formatting/{}.vcf".format(sim, sim))
    sim_vcf_dict["ddocent_full-"+sim] = os.path.join(DDOCENT_DIR, "SIMDATA/{}/{}_fastqs/TotalRawSNPs.vcf".format(sim, sim))
    sim_vcf_dict["ddocent_filt-"+sim] = os.path.join(DDOCENT_DIR, "SIMDATA/{}/{}_fastqs/Final.recode.vcf".format(sim, sim))

# ## Clean up the vcf data
# Each program writes out a vcf with some little quirks, so we have to smooth them all out:
# gunzip the ipyrad vcf and fix the stacks vcf header. You only need to run this cell once, if you are
# reanalyzing the simdata.

# In[37]:

for k, f in sim_vcf_dict.items():
    ## If it's gzipped then unzip it (only applies to ipyrad)
    if os.path.exists(f + ".gz"):
        print("gunzipping - {}".format(f+".gz"))
        cmd = "gunzip -c {}.gz > {}".format(f, f)
        get_ipython().system('$cmd')
        f = f.split(".gz")[0]
        sim_vcf_dict[k] = f

    if os.path.exists(f):
        print("found - {}".format(f))
        if "stacks" in k:
            ## The version of vcftools we have here doesn't like the newer version of vcf stacks
            ## writes, so we have to 'fix' the stacks vcf version
            shutil.copy2(f, f+".bak")
            with open(f) as infile:
                dat = infile.readlines()[1:]
            with open(f, 'w') as outfile:
                outfile.write("##fileformat=VCFv4.1\n")
                outfile.write("".join(dat))

        ## Actually don't do this, it removes information about the assembly.
        ## I don't know why i wanted to do this in the first place. Habit?
        ## try:
        ##     ## Remove all but biallelic.
        ##     outfile = f.split(".vcf")[0]+"-biallelic"
        ##     cmd = "{}vcftools --vcf {} --min-alleles 2 --max-alleles 2 --recode --out {}" \
        ##            .format(DDOCENT_DIR, f, outfile)
        ##     #print(cmd)
        ##     os.system(cmd)
        ##
        ##     ## update the vcf_dict
        ##     sim_vcf_dict[k] = outfile + ".recode.vcf"
        ## except Exception as inst:
        ##     print(inst)
    else:
        print("not found - {}".format(f))

# ### Pull depth and coverage stats out of the vcf files
# Get number of loci per sample, and locus coverage for each vcf file from each assembler and each
# simulated dataset. This is reading in from the vcf files from each assembly method. This is nice
# because vcf is relatively standard and all the tools can give us a version of vcf. It's not perfect
# though because it doesn't include information about monomorphic sites, so it doesn't tell us the
# true number of loci recovered. We can get an idea of coverage and depth at snps, but to get coverage
# and depth stats across all loci we need to dig into the guts of the output of each method (which
# we'll do later).
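# Since locus-level coverage has to be reconstructed from per-snp rows, here is a tiny sketch
# (toy values, not real data) of the numpy_indexed grouping that loci_coverage()/sample_nloci()
# rely on: snps are grouped by their locus id, and a sample counts as having a locus if it has
# data at any snp in that group.

# In[ ]:

## Toy illustration of grouping snps by locus with numpy_indexed
toy_chrom = np.array(["loc1", "loc1", "loc2", "loc3"])
toy_has_data = np.array([[True,  False, True ],
                         [True,  True,  False],
                         [False, True,  True ],
                         [True,  True,  True ]])
for locus, block in zip(*npi.group_by(toy_chrom)(toy_has_data)):
    ## number of samples with data at any snp in this locus
    print("{} - {}".format(locus, block.any(axis=0).sum()))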
# In[38]:

import collections

sim_loc_cov = collections.OrderedDict()
sim_snp_cov = collections.OrderedDict()
sim_sample_nsnps = collections.OrderedDict()
sim_sample_nlocs = collections.OrderedDict()

## Try just doing them all the same
for prog, filename in sim_vcf_dict.items():
    try:
        print("Doing - {}".format(prog))
        print("\t{}".format(filename))
        v = vcfnp.variants(filename, verbose=False, dtypes={"CHROM":"a24"}).view(np.recarray)
        c = vcfnp.calldata_2d(filename, verbose=False).view(np.recarray)
        sim_snp_cov[prog] = snp_coverage(c)
        sim_sample_nsnps[prog] = sample_nsnps(c)

        ## aftrRAD doesn't retain the kind of info we'd need for the next two
        if "aftrrad" in prog:
            continue
        sim_loc_cov[prog] = loci_coverage(v, c, prog)
        sim_sample_nlocs[prog] = sample_nloci(v, c, prog)
    except Exception as inst:
        print(inst)

# ### Write out the results to a file
# In case we need to restart the notebook, we won't have to come back and reload all the data.

# In[496]:

res_dict = {"sim_loc_cov":sim_loc_cov, "sim_snp_cov":sim_snp_cov,\
            "sim_sample_nsnps":sim_sample_nsnps, "sim_sample_nlocs":sim_sample_nlocs}
res = WORK_DIR + "RESULTS/"
if not os.path.exists(res):
    os.mkdir(res)
for k,v in res_dict.items():
    df = pd.DataFrame(v)
    df.to_csv(res + k + ".csv")

# ### Reload the data from csv files
# If you ever need to do this.

# In[497]:

for k,v in res_dict.items():
    df = pd.read_csv(res + k + ".csv")
    print(df)

# ### Print out the results
# This isn't very helpful, but you get an idea of what's going on. Values here for the stacks gapped
# analysis look weird because they are bad: the populations program was segfaulting during creation
# of the vcf, so it's super truncated. Not sure what the problem is. The results down below the plots
# pull in stats from other files so they are better.

# In[39]:

for statname, stat in {"sim_loc_cov":sim_loc_cov, "sim_snp_cov":sim_snp_cov,\
                       "sim_sample_nsnps":sim_sample_nsnps, "sim_sample_nlocs":sim_sample_nlocs}.items():
    for sim in ["-simno", "-simlo", "-simhi", "-simlarge"]:
        for prog in ["ipyrad", "pyrad", "stacks_ungapped", "stacks_gapped", "stacks_defualt",
                     "ddocent_filt", "ddocent_full", "aftrrad"]:
            try:
                print(prog+sim + " " + statname + "\t"),
                print(stat[prog + sim]),
                print(np.mean(stat[prog + sim]))
            except:
                print("No {} stats for {}".format(statname, prog + sim))
        print("------------------------------------------------------")

# ## Pairwise difference and PCA plots for each simulation treatment

# In[61]:

## Load the calldata into a dict so we don't have to keep loading and reloading it
calldata = {}
for sim in ["-simno", "-simlo", "-simhi", "-simlarge"]:
    for prog in ["ipyrad", "pyrad", "stacks_ungapped", "stacks_gapped", \
                 "stacks_defualt", "ddocent_filt", "ddocent_full", "aftrrad"]:
        print("{}".format(prog+sim)),
        print("{}".format(sim_vcf_dict[prog+sim]))
        c = vcfnp.calldata_2d(sim_vcf_dict[prog+sim], verbose=False).view(np.recarray)
        calldata[prog+sim] = c

# In[617]:

pop1 = ["1A_0", "1B_0", "1C_0", "1D_0"]
pop2 = ["2E_0", "2F_0", "2G_0", "2H_0"]
pop3 = ["3I_0", "3J_0", "3K_0", "3L_0"]
sim_sample_names = pop1 + pop2 + pop3
flt = np.in1d(np.array(sim_sample_names), pop1)
print(flt)

# ### Plot PCAs for each assembler and each simulated datatype

# In[619]:

## Some housekeeping with sample names to make the PCA plots prettier
pop1 = ["1A_0", "1B_0", "1C_0", "1D_0"]
pop2 = ["2E_0", "2F_0", "2G_0", "2H_0"]
pop3 = ["3I_0", "3J_0", "3K_0", "3L_0"]
pops = {"pop1":pop1, "pop2":pop2, "pop3":pop3}
pop_colors = {"pop1":"r", "pop2":"b", "pop3":"g"}
sim_sample_names = pop1 + pop2 + pop3

for sim in ["-simno", "-simlo", "-simhi", "-simlarge"]:
#for sim in ["-simhi"]:
    print("Doing - {}".format(sim))
    f, axarr = plt.subplots(2, 4, figsize=(16,8), dpi=1000)
    axarr = [a for b in axarr for a in b]
    for prog, ax in zip(["ipyrad", "pyrad", "stacks_ungapped", "stacks_gapped", \
                         "stacks_defualt", "ddocent_filt", "ddocent_full", "aftrrad"], axarr):
        ## Annoying debug messages
        #print("{}".format(prog+sim)),
        #print("{}".format(sim_vcf_dict[prog+sim]))

        coords1, model1 = getPCA(calldata[prog+sim])
        x = coords1[:, 0]
        y = coords1[:, 1]
        ax.scatter(x, y, marker='o')
        ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
        ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))

        for pop in pops.keys():
            flt = np.in1d(np.array(sim_sample_names), pops[pop])
            ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=pop_colors[pop],
                    label=pop, markersize=10, mec='k', mew=.5)

        ax.set_title(prog+sim, style="italic")
        ax.axison = True
    f.tight_layout()

# ### Plot pairwise distances for each assembler and each simulated datatype

# In[65]:

for sim in ["-simno", "-simlo", "-simhi", "-simlarge"]:
#for sim in ["-simhi"]:
    print("Doing - {}".format(sim))
    f, axarr = plt.subplots(2, 4, figsize=(16,8), dpi=1000)
    axarr = [a for b in axarr for a in b]
    for prog, ax in zip(["ipyrad", "pyrad", "stacks_ungapped", "stacks_gapped", \
                         "stacks_defualt", "ddocent_filt", "ddocent_full", "aftrrad"], axarr):
        print("{}".format(prog+sim)),
        print("{}".format(sim_vcf_dict[prog+sim]))

        ## Calculate pairwise distances
        dist = getDistances(calldata[prog+sim])

        ## Doing it this way works, but allel uses imshow internally which rasterizes the image
        #allel.plot.pairwise_distance(dist, labels=None, ax=ax, colorbar=False)

        ## Create the pcolormesh by hand
        dat = ensure_square(dist)
        ## for some reason np.flipud(dat) is chopping off one row of data
        p = ax.pcolormesh(np.arange(0,len(dat[0])), np.arange(0,len(dat[0])), dat,\
                          cmap="jet", vmin=np.min(dist), vmax=np.max(dist))
        ## Clip all heatmaps to actual sample size
        p.axes.axis("tight")

        ax.set_title(prog+sim, style="italic")
        ax.axison = False

# # Go through and pull in more fine grained results
# What we want to know is the total number of loci recovered (not just variable loci). First we'll
# create some dictionaries just like above, but we'll call them *_full_* to indicate that they include
# monomorphic sites. Because snps don't occur in monomorphic sites, kind of by definition, we are only
# really interested in the depth across loci and the number of loci recovered per sample.
# # In[71]: sim_full_loc_cov = collections.OrderedDict() sim_full_sample_nlocs = collections.OrderedDict() ## Try just doing them all the same for prog, filename in sim_vcf_dict.items(): print("Doing - {}\t".format(prog)), sim_full_loc_cov[prog] = [] sim_full_sample_nlocs[prog] = [] # ## ipyrad simulated results # In[407]: ## ipyrad stats IPYRAD_SIMOUT=os.path.join(IPYRAD_DIR, "SIMDATA/") for sim in ["simno", "simlo", "simhi", "simlarge"]: print("Doing - {}\t".format(sim)), simstring = "ipyrad-" + sim simdir = os.path.join(IPYRAD_SIMOUT, sim) statsfile = simdir + "/{}_outfiles/{}_stats.txt".format(sim, sim) infile = open(statsfile).readlines() sample_coverage = [int(x.strip().split()[1]) for x in infile[20:32]] #print(sample_coverage) print("mean sample coverage - {}\t".format(np.mean(sample_coverage))), print("min/max - {}/{}\t".format(np.min(sample_coverage), np.max(sample_coverage))) sim_full_sample_nlocs[simstring] = sample_coverage nmissing = [int(x.strip().split()[1]) for x in infile[38:50]] sim_full_loc_cov[simstring] = nmissing ## Just look at the ones we care about for ipyrad print([(x,y) for x,y in sim_full_loc_cov.items() if "ipyrad" in x]) print([(x,y) for x,y in sim_full_sample_nlocs.items() if "ipyrad" in x]) # ## pyrad simulated results # In[406]: ## ipyrad stats PYRAD_SIMOUT=os.path.join(PYRAD_DIR, "SIMDATA/") for sim in ["simno", "simlo", "simhi", "simlarge"]: simstring = "pyrad-"+sim print("Doing - {}\t".format(sim)), simdir = os.path.join(PYRAD_SIMOUT, sim) statsfile = simdir + "/stats/c85d6m2p3H3N3.stats" infile = open(statsfile).readlines() sample_coverage = [int(x.strip().split()[1]) for x in infile[8:20]] print("mean sample coverage - {}\t".format(np.mean(sample_coverage))), print("min/max - {}/{}\t".format(np.min(sample_coverage), np.max(sample_coverage))) sim_full_sample_nlocs[simstring] = sample_coverage nmissing = [0] + [int(x.strip().split()[1]) for x in infile[26:37]] sim_full_loc_cov[simstring] = nmissing ## Just look at the ones we care about for pyrad print([(x,y) for x,y in sim_full_loc_cov.items() if "pyrad" in x]) print([(x,y) for x,y in sim_full_sample_nlocs.items() if "pyrad" in x]) # ## stacks simulated results # In[111]: import gzip ## stacks stats STACKS_SIMOUT=os.path.join(STACKS_DIR, "SIMDATA/") STACKS_GAP_SIMOUT=os.path.join(STACKS_SIMOUT, "gapped/") STACKS_UNGAP_SIMOUT=os.path.join(STACKS_SIMOUT, "ungapped/") STACKS_DEFAULT_SIMOUT=os.path.join(STACKS_SIMOUT, "default/") for dir in [STACKS_GAP_SIMOUT, STACKS_UNGAP_SIMOUT, STACKS_DEFAULT_SIMOUT]: stacks_method = dir.split("/")[-2] for sim in ["simno", "simlo", "simhi", "simlarge"]: #print("Doing - {}-{}".format(stacks_method, sim)), simstring = "stacks_"+stacks_method+"-"+sim try: simdir = os.path.join(dir, sim) lines = open("{}/batch_1.haplotypes.tsv".format(simdir)).readlines() cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]] sim_full_loc_cov[simstring] = [cnts.count(i) for i in range(1,13)] except Exception as inst: print("loc_cov - {} - {}".format(inst, simdir)) try: sim_full_sample_nlocs[simstring] = [] samp_haps = glob.glob("{}/*matches*".format(simdir)) for f in samp_haps: lines = gzip.open(f).readlines() sim_full_sample_nlocs[simstring].append(len(lines) - 1) except Exception as inst: print("sample_nlocs - {} - {}".format(inst, simdir)) ## Just look at the stacks results to reduce the clutter print([(x,y) for x,y in sim_full_loc_cov.items() if "stacks" in x]) print([(x,y) for x,y in sim_full_sample_nlocs.items() if "stacks" in x]) # ## AftrRAD simulated results # AftrRAD 
# doesn't natively support vcf. The vcf files we made
# are in sim\*/Formatting/sim\*.vcf
#
# It's not straightforward how to get the locations for snps
# from the output files. aftrRAD does calculate and write these out
# as part of the Genotype.pl phase to a file called __/TempFiles/SNPLocationsToPlot.txt__
#
# It's also not straightforward how to get the number of loci recovered for each individual.
# You have to total up the counts from two different files for each individual. The
# `Outputs/Genotypes` directory has two files, one for `Haplotypes` and one for `Monomorphics`.
# The haplotypes file is a giant matrix of concatenated snps per locus so you have to
# search through each column to count up all the `NA`'s (missing data per locus per individual).
# Then you have to count up the number of non-zero monomorphic sites.

# In[192]:

AFTRRAD_SIMOUT=os.path.join(AFTRRAD_DIR, "SIMDATA/")
for sim in ["simno", "simlo", "simhi", "simlarge"]:
#for sim in ["simlo"]:
    simstring = "aftrrad-{}".format(sim)
    print("Doing - {}".format(simstring))
    simdir = os.path.join(AFTRRAD_SIMOUT, sim)

    ## The `17` in the file name is the min % of samples w/ data
    hapsfile = simdir + "/Output/Genotypes/Haplotypes_17.All.txt"
    monofile = simdir + "/Output/Genotypes/Monomorphics_17.txt"

    ## Set low_memory=False or else simlarge crashes pandas
    mono_df = pd.read_csv(monofile, delim_whitespace=True, header=0, low_memory=False)
    haps_df = pd.read_csv(hapsfile, delim_whitespace=True, header=0, index_col=0, low_memory=False)

    sample_coverage = {}

    ## Get the list of all the simulated sample names
    samplenames = mono_df.columns.values[1:]

    ## Do haplotypes file first because it is phased so there are 2 rows per sample
    ## so we end up double counting NA's. If you read the # of monomorphics first
    ## then you have to figure out some way to prevent the double counting, here
    ## we just count twice and overwrite.
    for row in haps_df.itertuples():
        sample_coverage[row[0].split("Individual")[1]] = len(pd.Series(row).nonzero()[0])

    for sname in samplenames:
        ## nonzero returns a tuple of which we are only interested in the first element
        sample_coverage[sname] += len(mono_df[sname].nonzero()[0])

    ## To sort or not to sort the coverage_df?
    print("mean sample coverage - {}".format(np.mean(sample_coverage.values()))),
    print("min/max - {}/{}".format(np.min(sample_coverage.values()), np.max(sample_coverage.values())))

    ## Put them in the dict sorted by sample name
    sim_full_sample_nlocs[simstring] = [sample_coverage[x] for x in sorted(sample_coverage.keys())]

    ## Now do locus coverage
    ## For the same reason as above, do the haplotypes file first. We create the tmp_counts
    ## counter and then make a new counter with the index being 1/2 to account for the
    ## doublecounting of NaN in the haplotypes file.
    ## Also, note that annoyingly pandas reads in "NA" as float('nan') rather than string "NA"
    ## which took me a _while_ to figure out.
    hap_nonzero = collections.Counter(haps_df.count())
    tmp_counts = collections.Counter(hap_nonzero)
    hap_counts = collections.Counter()
    for x in tmp_counts:
        hap_counts.update({x/2:tmp_counts[x]})

    ## Get monomorphics
    ## Get the sum of non-zero elements per row (the fancy .ix slicing drops the
    ## first column which is the sequence)
    mono_nonzero = (mono_df.ix[:, 1:] != 0).astype(int).sum(axis=1)
    mono_counts = collections.Counter(mono_nonzero)

    tot_counts = hap_counts + mono_counts

    ## Collections are unordered, and also any coverage level with no count
    ## will not be in the collection, so we have to order it and insert zeros at the appropriate locations
    dat = []
    for i in xrange(1,13):
        try:
            dat.append(tot_counts[i])
        except:
            dat.append(0)
    sim_full_loc_cov[simstring] = dat

print("Locus coverage")
print([(x,y) for x,y in sim_full_loc_cov.items() if "aftrrad" in x])
print("Sample nloci")
print([(x,y) for x,y in sim_full_sample_nlocs.items() if "aftrrad" in x])

# ## dDocent simulated results
# ~~Here's what's weird. The dDocent reference.fasta file which is generated by the cd-hit clustering
# looks like it contains the right number of clusters. Similarly, if you use samtools to view the sample
# `-RG.bam` files, and the `mapped.bed` file, both have 10000 loci as well. But if you look at the Contigs
# that make it to the final vcf files (either TotalRawSNPs or Final.recode) there are ~100+ contigs missing.
# I'm sure freebayes is filtering loci, but I diff'd the loci in the bed file and the vcf, and looked at the
# sequences of one of the missing loci in the cat-RRG.bam file and they look more or less normal. It's not
# the mapping quality flag (-m), because all sim seqs have high quality mapping. None of the other settings
# in the dDocent script or the default freebayes settings seem particularly conspicuous.~~ Monomorphic loci
# obviously aren't going to end up in the final vcf. It is incredibly not straightforward how to actually
# get access to the monomorphic loci from the intermediary files that dDocent creates. The shortcuts that
# dDocent takes to run super fast mean it's really hard to get "stacks" of sequence data per locus
# (e.g. the ipyrad .loci file).
#
#     grep Contig TotalRawSNPs.vcf | cut -f 1 | uniq -c | wc -l
#
# simno/simlo/simhi - 9887/9889/9890
#
# For simlarge there was a much bigger difference between the TotalRaw and Final.recode vcf files
# (98350/91620), TotalRaw recovers about 98.4% of true loci, but the final recode catches only about 92%.
#
# It's also not super straightforward how to get # loci per sample from the dDocent output, so
# I had to gin something up. Right now because it uses samtools to view the bam file and then does
# some shell manipulation, this is a little slow (20 seconds per simulation treatment), but tolerable.
#
# TODO: This is currently not tracking __which__ samples correspond with the number of loci recovered
# because the ddocent bam files use the weird naming scheme and I haven't back calculated the good names yet.
#
# This isn't exactly technically correct, because this is the counts of loci mapped for each individual,
# not the number of loci for each individual in the output!
#
# Also, this is a little slow. Takes about 30-40 minutes.

# In[ ]:

import subprocess

DDOCENT_SIMOUT=os.path.join(DDOCENT_DIR, "SIMDATA/")
for sim in ["simno", "simlo", "simhi", "simlarge"]:
#for sim in ["simlo"]:
    simdir = os.path.join(DDOCENT_SIMOUT, sim + "/" + sim + "_fastqs/")
    print("Doing {} - {}".format(sim, simdir))

    ## This is hackish.
You have to dip into the bam file to get locus counts that ## include counts for monomorphics. Note! that field 3 of the bam file is not ## the sequence data, but rather the dDocent mock-contig ('dDocent_Contig_3') ## the read maps to. So really this is kind of by proxy counting the number of loci. sample_coverage = [] for samp in glob.glob(simdir + "/*-RG.bam"): cmd = "{}samtools view {} | cut -f 3 | uniq | wc -l".format(DDOCENT_DIR, samp) res = subprocess.check_output(cmd, shell=True) sample_coverage.append(int(res.strip())) print("This must be taken with a grain of salt because these aren't the actual counts you get out of the final data files.") print("mean sample coverage - {}".format(np.mean(sample_coverage))) print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage))) print("Reading VCF") vcf_filt = pd.read_csv("{}/Final.recode.vcf".format(simdir), delim_whitespace=True, header=60, index_col=0) vcf_full = pd.read_csv("{}/TotalRawSNPs.vcf".format(simdir), delim_whitespace=True, header=60, index_col=0) for vcf_string, vcf_df in zip(["full", "filt"], [vcf_full, vcf_filt]): simstring = "ddocent_" + vcf_string + "-" + sim print("Doing {}".format(simstring)) ## Sample coverage is the same for both conditions here, because they are based on the ## same raw data. idxs = set(vcf_df.index) ## Here we are going to simultaneously accumulate information about nloci per sample and locus coverage counts. c = [] sample_cov_counter = collections.Counter() for i, idx in enumerate(idxs): ## Poor man's progress bar. For the impatient. if not i % 1000: print(i), ## Complicated song and dance here because if the locus only has one snp you have to do ## some bullshit to reshape the dataframe. tmp_df = vcf_df.loc[idx] if isinstance(tmp_df, pd.Series): tmp_df = tmp_df.to_frame().T ## The double apply is applying split() to all rows and columns to get the genotype data nonzero_samps = (tmp_df.iloc[:, 8:20].apply(lambda x: x.apply(lambda y: y.split(":")[0])) != "./.").\ astype(int).sum().nonzero()[0] sample_cov_counter.update(nonzero_samps) c.append(len(nonzero_samps)) ## The double apply is applying split() to all rows and columns to get the genotype data # count = len((tmp_df.iloc[:, 8:20].\ # apply(lambda x: x.apply(lambda y: y.split(":")[0])) != "./.").astype(int).sum().nonzero()[0]) # c.append(count) counts = collections.Counter(c) ## Fill in zero values that aren't in the locus counter dat = [] for i in xrange(1,13): try: dat.append(counts[i]) except: dat.append(0) sim_full_loc_cov[simstring] = dat sim_full_sample_nlocs[simstring] = sample_cov_counter.values() # Here's a different stupid way to do this... # # sample_counts = {} # sample_names = list(vcf.columns)[8:] # print(sample_names) # print("num loci - {}".format(len(set(vcf.index)))) # for name in sample_names: # print(name) # sample_counts[name] = 0 # for locus in set(vcf.index): # snps = vcf[name][locus] # if any(map(lambda x: x.split(":")[0] != "./.", snps)): # sample_counts[name] += 1 # else: # pass # #print("{} {} {}".format(name, locus, snps)) # print(sample_counts) print("Locus coverage") print([(x,y) for x,y in sim_full_loc_cov.items() if "ddocent" in x]) print("Sample nloci") print([(x,y) for x,y in sim_full_sample_nlocs.items() if "ddocent" in x]) # ### Write out results of the simulations # This data is worth its weight in gold, it takes forever to acquire, # so lets save it off to disk so we don't lose it. 
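# The next cell pickles these dicts into RESULTS/. For reference, a minimal sketch of reloading
# them in a fresh session (assuming the same WORK_DIR and the file names used below):

# In[ ]:

## Reload the pickled simulation results rather than re-parsing everything
import pickle
sim_full_sample_nlocs = pickle.load(open(WORK_DIR + "RESULTS/sim_sample_nlocs_df.p", 'rb'))
sim_full_loc_cov = pickle.load(open(WORK_DIR + "RESULTS/sim_full_loc_cov.p", 'rb'))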
# In[502]: import pickle sim_res_dict = {"sim_sample_nlocs_df":sim_full_sample_nlocs, "sim_full_loc_cov":sim_full_loc_cov} for k,v in sim_res_dict.items(): print(k) pickle.dump(v, open(WORK_DIR + "RESULTS/" + k + ".p", 'wb')) # ## Plotting simulation results # In[443]: simlevel = ["simno", "simlo", "simhi", "simlarge"] assemblers = ["ipyrad", "pyrad", "stacks_gapped", "stacks_ungapped",\ "stacks_default", "aftrrad", "ddocent_full", "ddocent_filt"] sim_sample_nlocs_df = pd.DataFrame(index=assemblers, columns=simlevel) for sim in simlevel: for ass in assemblers: simstring = "-".join([ass, sim]) sim_sample_nlocs_df[sim][ass] = np.mean(sim_full_sample_nlocs[simstring]) print("Mean number of loci recovered per sample.") ## Normalize all bins dat = sim_sample_nlocs_df[sim_sample_nlocs_df.columns].astype(float) for sim in simlevel: scale = 10000 if sim == "simlarge": scale = 100000 dat[sim] = dat[sim]/scale sns.heatmap(dat, square=True, center=1, linewidths=2, annot=True) print(sim_sample_nlocs_df) #print(dat) # ### Ugly plot for locus coverage. # In[552]: simlevel = ["simno", "simlo", "simhi", "simlarge"] assemblers = ["ipyrad", "pyrad", "stacks_gapped", "stacks_ungapped",\ "stacks_default", "aftrrad", "ddocent_full", "ddocent_filt"] df = pd.DataFrame(index=assemblers, columns=simlevel) df2 = pd.DataFrame(columns = ["assembler", "simtype"] + [x for x in xrange(1,13)]) df3 = pd.DataFrame(columns = ["assembler", "simtype", "simdata"]) ## Try to get a dataframe in the shape seaborn will be happy with dfsns = pd.DataFrame(columns = ["assembler", "simtype", "bin", "simdata"]) for sim in simlevel: for ass in assemblers: simstring = ass + "-" + sim df[sim][ass] = np.array(sim_full_loc_cov[simstring]) df2.loc[simstring] = [ass, sim] + sim_full_loc_cov[simstring] df3.loc[simstring] = [ass, sim, np.array(sim_full_loc_cov[simstring])] for i, val in enumerate(sim_full_loc_cov[simstring]): ## Normalize values so different sim sizes print the same max = 10000. if "large" in simstring: max = 100000. dfsns.loc[simstring + "-" + str(i)] = [ass, sim, i+1, val/max] sns.set(style="white") ## This kinda sucks, too spread out. #g = sns.FacetGrid(dfsns, row="assembler", col="simtype", margin_titles=True) #g.map(sns.barplot, "bin", "simdata", color="steelblue", lw=0) plt.rcParams['figure.figsize']=(20,20) g = sns.FacetGrid(dfsns, col="simtype", margin_titles=True, col_wrap=2, size=4, aspect=2) g.map(sns.barplot, "bin", "simdata", "assembler", lw=0.5, palette="bright").add_legend() # ### Another ugly plot for locus coverage. # In[553]: ## Barplots of assembly method x simdata type g = sns.FacetGrid(dfsns, col="assembler", margin_titles=True, col_wrap=4, size=3, aspect=1.5) g.map(sns.barplot, "bin", "simdata", "simtype", lw=0.5, palette="bright").add_legend() # ### Much better! Do the spline plot version of the above plot # It just looks nicer, makes more sense. # In[598]: dfsns2 = pd.DataFrame(columns = ["assembler", "simtype", "bin", "simdata"]) for sim in simlevel: for ass in assemblers: simstring = ass + "-" + sim ## Normalize values so different sim sizes print the same max = 10000. if "large" in simstring: max = 100000. 
        newdat = sim_full_loc_cov[simstring]
        newdat = [sum(newdat)-sum(newdat[:i-1]) for i in range(1,13)]
        for i, val in enumerate(newdat):
            dfsns2.loc[simstring + "-" + str(i)] = [ass, sim, i+1, val]

g = sns.FacetGrid(dfsns2, col="simtype", hue="assembler", sharex=True, sharey=False, size=4, col_wrap=2)
g.map(plt.scatter, "bin", "simdata")
g.map(plt.plot, "bin", "simdata").add_legend()
axs = g.axes
for ax in axs:
    ax.set_xlim(0,12.5)

# # Empirical Results (Pedicularis)
# ## Process the vcf output from all the runs
# Here we'll pull together all the output vcf files and clean them up
# (the biallelic filtering is currently commented out). This takes a few minutes to run.

# In[603]:

vcf_dict = {}
vcf_dict["ipyrad"] = os.path.join(IPYRAD_DIR, "REALDATA/REALDATA_outfiles/REALDATA.vcf")
## PYRAD_EMPIRICAL_OUTPUT is defined further down, so build the pyrad path from PYRAD_DIR here
vcf_dict["pyrad"] = os.path.join(PYRAD_DIR, "REALDATA/outfiles/c85d6m2p3H3N3.vcf")
vcf_dict["stacks_ungapped"] = os.path.join(STACKS_DIR, "REALDATA/ungapped/batch_1.vcf")
vcf_dict["stacks_gapped"] = os.path.join(STACKS_DIR, "REALDATA/gapped/batch_1.vcf")
vcf_dict["stacks_default"] = os.path.join(STACKS_DIR, "REALDATA/default/batch_1.vcf")
vcf_dict["aftrrad"] = os.path.join(AFTRRAD_DIR, "REALDATA/Formatting/REALDATA.vcf")
vcf_dict["ddocent_full"] = os.path.join(DDOCENT_DIR, "REALDATA/TotalRawSNPs.vcf")
vcf_dict["ddocent_filt"] = os.path.join(DDOCENT_DIR, "REALDATA/Final.recode.vcf")

for k, f in vcf_dict.items():
    if os.path.exists(f):
        print("found - {}".format(f))

        ## If it's gzipped then unzip it (only applies to ipyrad I think)
        if ".gz" in f:
            print("gunzipping")
            cmd = "gunzip {}".format(f)
            get_ipython().system('$cmd')
            vcf_dict[k] = f.split(".gz")[0]

        ## Remove all but biallelic (for ipyrad this also removes all the monomorphic)
        #outfile = f.split(".vcf")[0]+"-biallelic"
        #cmd = "{}vcftools --vcf {} --min-alleles 2 --max-alleles 2 --recode --out {}" \
        #       .format(DDOCENT_DIR, f, outfile)
        #print(cmd)
        #!$cmd

        ## update the vcf_dict
        #vcf_dict[k] = outfile + ".recode.vcf"
    else:
        print("not found - {}".format(f))

# ## Read in vcf for each analysis and pull in coverage/depth stats

# In[604]:

emp_loc_cov = collections.OrderedDict()
emp_snp_cov = collections.OrderedDict()
emp_sample_nsnps = collections.OrderedDict()
emp_sample_nlocs = collections.OrderedDict()

## Try just doing them all the same
for prog, filename in vcf_dict.items():
    try:
        print("Doing - {}".format(prog))
        print("\t{}".format(filename))
        v = vcfnp.variants(filename, dtypes={"CHROM":"a24"}).view(np.recarray)
        c = vcfnp.calldata_2d(filename).view(np.recarray)
        emp_loc_cov[prog] = loci_coverage(v, c, prog)
        emp_snp_cov[prog] = snp_coverage(c)
        emp_sample_nsnps[prog] = sample_nsnps(c)
        emp_sample_nlocs[prog] = sample_nloci(v, c, prog)
        plotPCA(c, prog)
        plotPairwiseDistance(c, prog)
    except Exception as inst:
        print(inst)

# In[608]:

for i in emp_sample_nlocs:
    print(i),
    print(emp_sample_nlocs[i])

# ## ipyrad Empirical results
# First we load in the variant info and call data for all the snps

# In[70]:

IPYRAD_EMPIRICAL_OUTPUT=os.path.join(IPYRAD_DIR, "REALDATA/")
IPYRAD_STATS = os.path.join(IPYRAD_EMPIRICAL_OUTPUT, "REALDATA_outfiles/REALDATA_stats.txt")
infile = open(IPYRAD_STATS).readlines()
sample_coverage = [int(x.strip().split()[1]) for x in infile[20:33]]
print(sample_coverage)
print("mean sample coverage - {}".format(np.mean(sample_coverage)))
print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))

# Read the biallelic vcf

# In[71]:

filename = os.path.join(IPYRAD_EMPIRICAL_OUTPUT, "REALDATA_outfiles/REALDATA.biallelic.vcf")
# filename = vcf_dict["ipyrad"]
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)

# ### Distribution of snps along loci
# Getting variable sites and parsimony informative sites from the vcf is kind of annoying
# because all the programs export __slightly__ different formats, so you need to
# parse them in slightly different ways. There's a better way to do this for ipyrad
# but I figure I'll do it the same way for all of them so it's more clear what's happening.

# In[72]:

## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])

## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)

## Create a T/F mask for snps that are informative
only_pis = map(lambda x: x < 2, alt_counts)

## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["POS"]), mask=only_pis).compressed()

## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x) for x in pis])
#distpis = [x for x in sorted(counts.items())]

## Getting the distvar is easier
distvar = Counter([int(x) for x in v.POS])
#distvar = [x for x in sorted(counts.items())]

canvas, axes = SNP_position_plot(distvar, distpis)

## save fig
#toyplot.html.render(canvas, 'snp_positions.html')

canvas

# In[113]:

## Have to gunzip the ipyrad REALDATA vcf
plotPCA(c, "ipyrad")

# In[114]:

plotPairwiseDistance(c, "ipyrad")

# ## pyRAD empirical results
#
# #### TODO: Figure out why vcfnp is failing to read in pyrad vcf

# In[206]:

PYRAD_EMPIRICAL_OUTPUT=os.path.join(PYRAD_DIR, "REALDATA/")
PYRAD_STATS = os.path.join(PYRAD_EMPIRICAL_OUTPUT, "stats/c85d6m2p3H3N3.stats")
infile = open(PYRAD_STATS).readlines()
sample_coverage = [int(x.strip().split()[1]) for x in infile[8:21]]
print(sample_coverage)
print("mean sample coverage - {}".format(np.mean(sample_coverage)))
print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))

# ### Pull in the pyrad vcf
# Be careful because sometimes the pyrad vcf gets a newline inserted between the last
# line of metadata and the first line of genotypes, which causes vcfnp to silently fail.
# Also there are a couple flags you can pass to vcfnp for debugging, but they print
# hella data to the console.
# # v = vcfnp.variants(filename, progress=1, verbose=True).view(np.recarray) # In[230]: filename = os.path.join(PYRAD_EMPIRICAL_OUTPUT, "outfiles/c85d6m2p3H3N3.vcf") v = vcfnp.variants(filename).view(np.recarray) c = vcfnp.calldata_2d(filename).view(np.recarray) # ### Distribution of snps along loci # In[231]: ## Get only parsimony informative sites ## Get T/F values for whether each genotype is ref or alt across all samples/loci is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"]) ## Count the number of alt alleles per snp (we only want to retain when #alt > 1) alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele) ## Create a T/F mask for snps that are informative only_pis = map(lambda x: x < 2, alt_counts) ## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus ## Also, compress() the masked array so we only actually see the pis pis = np.ma.array(np.array(v["POS"]), mask=only_pis).compressed() ## Now have to massage this into the list of counts per site in increasing order ## of position across the locus distpis = Counter([int(x) for x in pis]) #distpis = [x for x in sorted(counts.items())] ## Getting the distvar is easier distvar = Counter([int(x) for x in v.POS]) #distvar = [x for x in sorted(counts.items())] canvas, axes = SNP_position_plot(distvar, distpis) ## save fig #toyplot.html.render(canvas, 'snp_positions.html') canvas # ## stacks empirical results (ungapped) # In[12]: STACKS_OUTPUT=os.path.join(STACKS_DIR, "REALDATA/") STACKS_GAP_OUT=os.path.join(STACKS_OUTPUT, "gapped/") STACKS_UNGAP_OUT=os.path.join(STACKS_OUTPUT, "ungapped/") STACKS_DEFAULT_OUT=os.path.join(STACKS_OUTPUT, "default/") #lines = open("SIMsmall/stackf_high/batch_1.haplotypes.tsv").readlines() #cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]] #shigh = [cnts.count(i) for i in range(1,13)] # In[13]: filename = os.path.join(STACKS_UNGAP_OUT, "batch_1.vcf") v = vcfnp.variants(filename).view(np.recarray) c = vcfnp.calldata_2d(filename).view(np.recarray) # ### Distribution of snps along loci # In[14]: ## Get only parsimony informative sites ## Get T/F values for whether each genotype is ref or alt across all samples/loci is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"]) ## Count the number of alt alleles per snp (we only want to retain when #alt > 1) alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele) ## Create a T/F mask for snps that are informative only_pis = map(lambda x: x < 2, alt_counts) ## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus ## Also, compress() the masked array so we only actually see the pis pis = np.ma.array(np.array(v["ID"]), mask=only_pis).compressed() ## Now have to massage this into the list of counts per site in increasing order ## of position across the locus distpis = Counter([int(x.split("_")[1]) for x in pis]) #distpis = [x for x in sorted(counts.items())] ## Getting the distvar is easier distvar = Counter([int(x.split("_")[1]) for x in v.ID]) #distvar = [x for x in sorted(counts.items())] canvas, axes = SNP_position_plot(distvar, distpis) ## save fig #toyplot.html.render(canvas, 'snp_positions.html') canvas # In[15]: plotPCA(c, "stacks ungapped") # In[16]: plotPairwiseDistance(c, "stacks ungapped") # ## stacks empirical results (defaults) # In[16]: filename = os.path.join(STACKS_DEFAULT_OUT, "batch_1.vcf") v = vcfnp.variants(filename).view(np.recarray) c = vcfnp.calldata_2d(filename).view(np.recarray) # ### Some 
informative stats # TODO # ### Distribution of snps along loci # In[17]: ## Get only parsimony informative sites ## Get T/F values for whether each genotype is ref or alt across all samples/loci is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"]) ## Count the number of alt alleles per snp (we only want to retain when #alt > 1) alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele) ## Create a T/F mask for snps that are informative only_pis = map(lambda x: x < 2, alt_counts) ## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus ## Also, compress() the masked array so we only actually see the pis pis = np.ma.array(np.array(v["ID"]), mask=only_pis).compressed() ## Now have to massage this into the list of counts per site in increasing order ## of position across the locus distpis = Counter([int(x.split("_")[1]) for x in pis]) #distpis = [x for x in sorted(counts.items())] ## Getting the distvar is easier distvar = Counter([int(x.split("_")[1]) for x in v.ID]) #distvar = [x for x in sorted(counts.items())] canvas, axes = SNP_position_plot(distvar, distpis) ## save fig toyplot.html.render(canvas, 'snp_positions.html') canvas # In[18]: plotPCA(c, "stacks default") # In[12]: plotPairwiseDistance(c, "stacks default") # ## AftrRAD empirical results # In[ ]: AFTRRAD_OUTPUT=os.path.join(AFTRRAD_DIR, "REALDATA/") # In[ ]: filename = os.path.join(STACKS_DEFAULT_OUT, "batch_1.vcf") v = vcfnp.variants(filename).view(np.recarray) c = vcfnp.calldata_2d(filename).view(np.recarray) # ### Distribution of snps along loci # NOT TESTED # # It's not very straightforward how to extract information about # whether each snp is informative or not, given the output files # that aftrRAD generates. It does create a file with all the snp # locations so we can use this to generate the distribution of # variable sites, but the pis will be empty. # In[184]: f = os.path.join(AFTRRAD_OUTPUT, "TempFiles/SNPLocationsToPlot.txt") with open(f) as infile: snplocs = infile.read().split() ## Now have to massage this into the list of counts per site in increasing order ## of position across the locus distpis = Counter([]) ## Getting the distvar is easier distvar = Counter(map(int, snplocs)) canvas, axes = SNP_position_plot(distvar, distpis) ## save fig ## toyplot.html.render(canvas, 'snp_positions.html') canvas # ## dDocent empirical results # HAVE TO MAKE SURE THE SAMPLES IN THE DDOCENT OUTPUT VCF ARE IN THE SAME ORDER AS THOSE IN THE IPYRAD/STACKS # There is now code in the horserace notebook to rename samples in the ddocent # vcf and to reorder the samples to match ipyrad/stacks. 
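# The actual renaming/reordering code lives in the horserace notebook; purely as an illustration,
# a minimal sketch of the idea might look like the following. `name_map` (dDocent name -> ipyrad/stacks
# name) and `sample_order` are hypothetical placeholders here, not values from the real analysis.

# In[ ]:

## Sketch: rename and reorder the sample columns of a vcf to match another assembler's ordering
def reorder_vcf_samples(vcf_file, out_file, name_map, sample_order):
    with open(vcf_file) as infile:
        lines = infile.readlines()
    ## Keep the ## meta-information lines as-is
    meta = [l for l in lines if l.startswith("##")]
    body = [l.rstrip("\n").split("\t") for l in lines if not l.startswith("##")]
    df = pd.DataFrame(body[1:], columns=body[0])
    ## Rename samples, then keep the 9 fixed vcf columns followed by samples in the desired order
    df = df.rename(columns=name_map)
    fixed_cols = list(df.columns[:9])
    df = df[fixed_cols + sample_order]
    with open(out_file, 'w') as outfile:
        outfile.write("".join(meta))
        df.to_csv(outfile, sep="\t", index=False)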
# In[ ]: import subprocess DDOCENT_OUTPUT=os.path.join(DDOCENT_DIR, "REALDATA/") os.chdir(DDOCENT_OUTPUT) print("Getting max loci per sample from bam files.") sample_coverage = [] for samp in glob.glob("*-RG.bam"): cmd = "{}samtools view {} | cut -f 3 | uniq | wc -l".format(DDOCENT_DIR, samp) res = subprocess.check_output(cmd, shell=True) print("nloci {} - {}".format(res.strip(), samp)) sample_coverage.append(int(res.strip())) print(sorted(sample_coverage, reverse=True)) print("mean sample coverage - {}".format(np.mean(sample_coverage))) print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage))) #sim_coverage_df["ddocent_"+sim] = sample_coverage vcf_filt = pd.read_csv("Final.recode.vcf", delim_whitespace=True, header=60, index_col=0) vcf_full = pd.read_csv("TotalRawSNPs.vcf", delim_whitespace=True, header=60, index_col=0) ## There's gotta be a faster way to do this... ## It's really fuckin slow on real data, like hours. for vcf in [vcf_full, vcf_filt]: sample_counts = {} sample_names = list(vcf.columns)[8:] print(sample_names) print("num loci in vcf - {}".format(len(set(vcf.index)))) for name in sample_names: print(name) sample_counts[name] = 0 for locus in set(vcf.index): snps = vcf[name][locus] if any(map(lambda x: x.split(":")[0] != "./.", snps)): sample_counts[name] += 1 else: pass print(sample_counts) print(sim_coverage_df) # In[171]: ## Basic difference btw Final.recode and TotalRawSNPs is that final recode ## only includes individuals with snps called in > 90% of samples f1 = os.path.join(DDOCENT_OUTPUT, "TotalRawSNPs.vcf") v_full = vcfnp.variants(f1, dtypes={"CHROM":"a24"}).view(np.recarray) c_full = vcfnp.calldata_2d(f1).view(np.recarray) f2 = os.path.join(DDOCENT_OUTPUT, "Final.recode.vcf") v_filt = vcfnp.variants(f2, dtypes={"CHROM":"a24"}).view(np.recarray) c_filt = vcfnp.calldata_2d(f2).view(np.recarray) # ### Distribution of snps along loci # In[75]: for dset in [[v_full, c_full], [v_filt, c_filt]]: ## Get only parsimony informative sites ## Get T/F values for whether each genotype is ref or alt across all samples/loci is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), dset[1]["genotype"]) ## Count the number of alt alleles per snp (we only want to retain when #alt > 1) alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele) ## Create a T/F mask for snps that are informative only_pis = map(lambda x: x < 2, alt_counts) ## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus ## Also, compress() the masked array so we only actually see the pis pis = np.ma.array(np.array(dset[0]["POS"]), mask=only_pis).compressed() ## Now have to massage this into the list of counts per site in increasing order ## of position across the locus distpis = Counter([int(x) for x in pis]) ## Getting the distvar is easier distvar = Counter([int(x) for x in v_filt.POS]) canvas, axes = SNP_position_plot(distvar, distpis) ## save fig ## toyplot.html.render(canvas, 'snp_positions.html') canvas # In[82]: #Tmp delete me print(np.sum(distvar.values())) print(np.sum(distpis.values())) # In[238]: for dset in [[v_full, c_full], [v_filt, c_filt]]: plotPCA(dset[1], "ddocent") # In[239]: plotPairwiseDistance(c_full, "dDocent Full") plotPairwiseDistance(c_filt, "dDocent Filtered") # # Everything Below here is crap! 
# In[584]: ## Testing for different vcf formats prog = "stacks-simhi" filename = "/home/iovercast/manuscript-analysis/stacks/SIMDATA/ungapped/simhi/batch_1.vcf" #prog = "dDocent-simhi" #filename = "/home/iovercast/manuscript-analysis/dDocent/SIMDATA/simhi/simhi_fastqs/TotalRawSNPs.vcf" #prog = "ipyrad-simlarge" #filename = "/home/iovercast/manuscript-analysis/ipyrad/SIMDATA/simlarge/simlarge_outfiles/simlarge-biallelic.recode.vcf" prog = "pyrad-simhi" filename = "/home/iovercast/manuscript-analysis/pyrad/SIMDATA/simhi/outfiles/c85d6m2p3H3N3.vcf" v = vcfnp.variants(filename, dtypes={"CHROM":"a24"}).view(np.recarray) c = vcfnp.calldata_2d(filename).view(np.recarray) print("loc_cov", loci_coverage(v, c, prog)) print("snp_cov", snp_coverage(c)) print("snps", sample_nsnps(c)) print("nlocs", sample_nloci(v, c, prog)) # # Everything Below here is crap! # # Everything Below here is crap! # In[49]: ## Maybe useful print(v.ID) print(len(v.ID)) print(len(set([x.split("_")[0] for x in v.ID]))) # In[37]: import itertools import numpy.ma as ma print(v.ID) print(len(v.ID)) loc_list = [] loci = set([x.split("_")[0] for x in v.ID]) print(len(loci)) loc_list = [list(val) for key,val in itertools.groupby(v.ID,key=lambda x:x.split("_")[0])] print(len(loc_list)) ## experimentation print(v.dtype) print(c.dtype) print(np.mean(c.DP)) print(loc_list[0:10]) subsamp = np.array([np.random.choice(x, 1, replace=False)[0] for x in loc_list]) mask = np.in1d(v.ID, subsamp) # In[55]: g = allel.GenotypeArray(cmask.genotype) gn = g.to_n_alt() m = allel.stats.rogers_huff_r(gn[:1000]) ** 2 ax = allel.plot.pairwise_ld(m) # In[ ]: ## Some crap from when i was counting loci a very stupid and slow way for stacks. ### This doesn't' count monomorphics, but could be useful still. It's dog slow. 
#vcf = pd.read_csv("{}/batch_1.vcf".format(simdir), delim_whitespace=True, header=9, index_col=2) #sample_counts = {} #sample_names = list(vcf.columns)[8:] #print(sample_names) #uniq_loci = set([x.split("_")[0] for x in vcf.index]) #print("num loci - {}".format(len(uniq_loci))) ## Replace the ID index in the stacks vcf file so we can actually evaluate loci #vcf.index = pd.Index([x.split("_")[0] for x in vcf.index]) #for name in sample_names: # print(name), # sample_counts[name] = 0 # for locus in uniq_loci: # snps = vcf[name][locus] # if any(map(lambda x: x.split(":")[0] != "./.", snps)): # sample_counts[name] += 1 # else: # pass # #print("{} {} {}".format(name, locus, snps)) #print(sample_counts) #sim_full_sample_nlocs["stacks_"+stacks_method+"-"+sim] = sample_coverage # In[191]: filename = os.path.join(DDOCENT_OUTPUT, "Final.recode.snps.vcf") v_filt = vcfnp.variants(filename).view(np.recarray) c_filt = vcfnp.calldata_2d(filename).view(np.recarray) # In[220]: from collections import Counter print(v_filt.dtype) #wat = Counter(vcall["POS"]) #wat ## DDOCENT v_filt["is_snp"] print(v_filt[0:2]) print(v_filt["NS"][:40]) print(v_filt["NUMALT"][:40]) print(v_filt["ALT"][:40]) loci = set(v_filt["CHROM"]) # In[200]: print(c.dtype) print(len(c)) print(c["GT"][0:10][:,0]) print((c["GT"][0:10000][:,0] == "./.").sum()) print(c["GT"][0:5] != "./.") print((x != "./.").sum() for x in c["GT"][0:5]) from collections import Counter loci_coverage = [x.sum() for x in c_filt["GT"].T != "./."] print(loci_coverage) # In[599]: #x = sim_loc_cov #x = sim_snp_cov #x = sim_sample_nsnps x = sim_sample_nlocs for sim in ["-simno", "-simlo", "-simhi", "-simlarge"]: for prog in ["ipyrad", "pyrad", "stacks_ungapped", "stacks_gapped", "stacks_defualt", "ddocent_filt", "ddocent_full", "aftrrad"]: try: print(prog+sim+"\t"), print(x[prog+sim]), print(np.mean(x[prog+sim])) except: print("No {}".format(prog+sim)) print("------------------------------------------------------") # In[338]: if True: keys = ["e", "b", "b", "c", "d", "e", "e", 'a'] values = [1.2, 4.5, 4.3, 2.0, 5.67, 8.08, 9.01,1] print('two methods of splitting by group') print('as list') for k,v in zip(*npi.group_by(keys)(values)): print(k, v) print('as iterable') g = npi.group_by(keys) for k,v in zip(g.unique, g.split_sequence_as_iterable(values)): print(k, list(v)) print('iterable as iterable') for k, v in zip(g.unique, g.split_iterable_as_iterable(values)): print(k, list(v))