#!/usr/bin/env python
# coding: utf-8

# # Results for ipyrad vs stacks vs dDocent simulated and empirical reference sequence assembly

# I'm assuming here that you've already run the de novo assemblies and have already installed all the
# prereqs for analysing the results. If not, run the ipyrad-manuscript-results.ipynb notebook first.

# In[1]:

## Imports and working/output directories

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib
plt.rcParams["figure.figsize"] = [12, 9]

from collections import Counter
import seaborn as sns
sns.set_style('white')
sns.set_style('ticks')

import toyplot
import toyplot.html    ## toyplot sublib for saving html plots
import pandas as pd
import numpy as np
import subprocess
import collections
import allel
import vcfnp
import shutil
import gzip
import glob
import os

from allel.util import *    ## for ensure_square()

WORK_DIR = "/home/iovercast/manuscript-analysis/"

## Simulation dirs
REFMAP_SIM_DIR = os.path.join(WORK_DIR, "REFMAP_SIM/")
IPYRAD_SIM_DIR = os.path.join(REFMAP_SIM_DIR, "ipyrad/reference-assembly/")
STACKS_SIM_DIR = os.path.join(REFMAP_SIM_DIR, "stacks/")
DDOCENT_SIM_DIR = os.path.join(REFMAP_SIM_DIR, "ddocent/")

## A list of the simulated assembler names for indexing dicts
assemblers = ["ipyrad-reference", "ipyrad-denovo_reference", "stacks", "ddocent"]

## Empirical dirs
REFMAP_EMPIRICAL_DIR = os.path.join(WORK_DIR, "Phocoena_empirical/")
IPYRAD_REFMAP_DIR = os.path.join(REFMAP_EMPIRICAL_DIR, "ipyrad/")
STACKS_REFMAP_DIR = os.path.join(REFMAP_EMPIRICAL_DIR, "stacks/")
DDOCENT_REFMAP_DIR = os.path.join(REFMAP_EMPIRICAL_DIR, "ddocent/")

# # Some helpful functions

# ## Function for plotting PCA given an input vcf file

# In[334]:

## LD pruning code from the scikit-allel cookbook
def plot_ld(gn, title):
    m = allel.stats.rogers_huff_r(gn) ** 2
    ax = allel.plot.pairwise_ld(m)
    ax.set_title(title)

def ld_prune(gn, size=1000, step=1000, threshold=.3, n_iter=5):
    for i in range(n_iter):
        loc_unlinked = allel.stats.ld.locate_unlinked(gn, size=size, step=step, threshold=threshold)
        n = np.count_nonzero(loc_unlinked)
        n_remove = gn.shape[0] - n
        print('iteration', i+1, 'retaining', n, 'removing', n_remove, 'variants')
        gn = gn.compress(loc_unlinked, axis=0)
    return gn

def plotPCA(call_data, title):
    c = call_data
    g = allel.GenotypeArray(c.genotype)
    ac = g.count_alleles()
    ## Filter singletons and multi-allelic snps
    flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
    gf = g.compress(flt, axis=0)
    gn = gf.to_n_alt()
    coords1, model1 = allel.stats.pca(gn, n_components=10, scaler='patterson')
    fig = plt.figure(figsize=(5, 5))
    ax = fig.add_subplot(1, 1, 1)
    sns.despine(ax=ax, offset=5)
    x = coords1[:, 0]
    y = coords1[:, 1]
    ## We know this works because the species_dict and the columns in the vcf
    ## are in the same order.
    for sp in species:
        flt = (np.array(species_dict.values()) == sp)
        ax.plot(x[flt], y[flt], marker='o', linestyle=' ',
                color=species_colors[sp], label=sp, markersize=10, mec='k', mew=.5)
    ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
    ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))
    ax.legend(bbox_to_anchor=(1, 1), loc='upper left')
    fig.suptitle(title+" pca", y=1.02, style="italic", fontsize=20, fontweight='bold')
    fig.tight_layout()

def getPCA(call_data):
    c = call_data
    g = allel.GenotypeArray(c.genotype)
    ac = g.count_alleles()
    ## Filter singletons and multi-allelic snps
    flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1)
    gf = g.compress(flt, axis=0)
    gn = gf.to_n_alt()
    ## Tested w/ ld pruning. Doesn't appreciably change the results.
    #gnu = ld_prune(gn)
    #gn = gnu
    coords1, model1 = allel.stats.pca(gn, n_components=10, scaler='patterson')
    return coords1, model1

## We don't actually use this function because the allel.plot.pairwise_distance()
## call returns a raster which looks nasty in print.
def plotPairwiseDistance(call_data, title):
    c = call_data
    #c = vcfnp.calldata_2d(filename).view(np.recarray)
    g = allel.GenotypeArray(c.genotype)
    gn = g.to_n_alt()
    dist = allel.stats.pairwise_distance(gn, metric='euclidean')
    allel.plot.pairwise_distance(dist, labels=species_dict.keys())

def getDistances(call_data):
    c = call_data
    #c = vcfnp.calldata_2d(filename).view(np.recarray)
    g = allel.GenotypeArray(c.genotype)
    gn = g.to_n_alt()
    dist = allel.stats.pairwise_distance(gn, metric='euclidean')
    return dist
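# Quick illustration of what the biallelic/non-singleton filter inside getPCA() keeps and drops,
# on a few made-up genotypes (a toy check, not part of the analysis):

# In[ ]:

## Three snps x three diploid samples
toy_g = allel.GenotypeArray([[[0, 0], [0, 1], [1, 1]],    ## biallelic, alt allele seen 3x -> kept
                             [[0, 0], [0, 0], [0, 1]],    ## singleton alt -> dropped
                             [[0, 0], [1, 1], [2, 2]]])   ## multi-allelic -> dropped
toy_ac = toy_g.count_alleles()
toy_flt = (toy_ac.max_allele() == 1) & (toy_ac[:, :2].min(axis=1) > 1)
print(toy_flt)                                            ## [ True False False]
print(toy_g.compress(toy_flt, axis=0).to_n_alt())         ## [[0 1 2]]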
# ## Function for plotting distribution of variable sites across loci

# In[3]:

## Inputs to this function are two Counters where keys
## are the base position and values are snp counts.
## The counter doesn't have to be sorted because we sort internally.
def SNP_position_plot(prog, distvar, distpis):
    ## The last position to consider
    maxend = np.array(distvar.keys()).max()

    ## This does two things, first it sorts in increasing
    ## order. Second, it creates a count bin for any position
    ## without snps and sets the count to 0.
    distvar = [distvar[x] for x in xrange(maxend)]
    distpis = [distpis[x] for x in xrange(maxend)]

    ## set color theme
    colormap = toyplot.color.Palette()

    ## make a canvas
    canvas = toyplot.Canvas(width=800, height=300)

    ## make axes
    axes = canvas.cartesian(xlabel="Position along RAD loci",
                            ylabel="N variable sites",
                            gutter=65)
    axes.label.text = prog

    ## x-axis
    axes.x.ticks.show = True
    axes.x.label.style = {"baseline-shift":"-40px", "font-size":"16px"}
    axes.x.ticks.labels.style = {"baseline-shift":"-2.5px", "font-size":"12px"}
    axes.x.ticks.below = 5
    axes.x.ticks.above = 0
    axes.x.domain.max = maxend
    axes.x.ticks.locator = toyplot.locator.Explicit(
        range(0, maxend, 5),
        map(str, range(0, maxend, 5)))

    ## y-axis
    axes.y.ticks.show = True
    axes.y.label.style = {"baseline-shift":"40px", "font-size":"16px"}
    axes.y.ticks.labels.style = {"baseline-shift":"5px", "font-size":"12px"}
    axes.y.ticks.below = 0
    axes.y.ticks.above = 5

    ## add fill plots
    x = np.arange(0, maxend)
    f1 = axes.fill(x, distvar, color=colormap[0], opacity=0.5, title="total variable sites")
    f2 = axes.fill(x, distpis, color=colormap[1], opacity=0.5, title="parsimony informative sites")

    ## add a horizontal dashed line at the median Nsnps per site
    axes.hlines(np.median(distvar), opacity=0.9, style={"stroke-dasharray":"4, 4"})
    axes.hlines(np.median(distpis), opacity=0.9, style={"stroke-dasharray":"4, 4"})

    return canvas, axes

# ## Functions for polling stats from vcf call data

# In[4]:

import numpy_indexed as npi

## Get the number of samples with data at each snp
def snp_coverage(call_data):
    snp_counts = collections.Counter([x.sum() for x in call_data["GT"] != "./."])
    ## Fill zero values
    return [snp_counts[x] for x in xrange(1, np.array(snp_counts.keys()).max()+1)]

## Get the number of samples with data at each locus
def loci_coverage(var_data, call_data, assembler):
    if "stacks" in assembler:
        loci = zip(*npi.group_by(map(lambda x: x.split("_")[0], var_data["ID"]))(call_data["GT"] != "./."))
    else:
        loci = zip(*npi.group_by(var_data["CHROM"])(call_data["GT"] != "./."))

    counts_per_snp = []
    for z in xrange(0, len(loci)):
        counts_per_snp.append([x.sum() for x in loci[z][1]])
    counts = collections.Counter([np.max(x) for x in counts_per_snp])
    ## Fill all zero values
    return [counts[x] for x in xrange(1, np.array(counts.keys()).max()+1)]

## Get total number of snps per sample
def sample_nsnps(call_data):
    return [x.sum() for x in call_data["GT"].T != "./."]

## Get total number of loci per sample
def sample_nloci(var_data, call_data, assembler):
    if "stacks" in assembler:
        locus_groups = npi.group_by(map(lambda x: x.split("_")[0], var_data["ID"]))(call_data["GT"] != "./.")
    else:
        locus_groups = npi.group_by(var_data["CHROM"])(call_data["GT"] != "./.")

    by_locus = [x.T for x in locus_groups[1]]
    by_sample = np.array([x.any(axis=1) for x in by_locus])
    return [x.sum() for x in by_sample.T]
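# Quick sanity check of snp_coverage() on made-up call data; a plain dict stands in for the
# vcfnp recarray since the "GT" field is all these helpers touch (toy values, not real results):

# In[ ]:

toy_calls = {"GT": np.array([["0/0", "./.", "1/1"],     ## 2 of 3 samples have data
                             ["0/1", "0/0", "./."],     ## 2 of 3 samples have data
                             ["0/0", "0/1", "1/1"]])}   ## all 3 samples have data
## -> no snps covered in exactly 1 sample, two snps in 2 samples, one snp in 3
print(snp_coverage(toy_calls))   ## [0, 2, 1]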
# ## End housekeeping. Begin actual analysis of results.

# # Results from simulated data

# ## First let's look at results just from the vcf files (so this is only looking at variable loci).

# The first thing we'll do is create a dataframe for storing a bunch
# of coverage information from the runs for each method.

# In[29]:

## Make a new pandas dataframe for holding the coverage results
sim_vcf_dict = {}
sim_vcf_dict["ipyrad-reference"] = os.path.join(IPYRAD_SIM_DIR, "refmap-sim_outfiles/refmap-sim.vcf")
sim_vcf_dict["ipyrad-denovo_plus_reference"] = os.path.join(IPYRAD_SIM_DIR, "denovo_plus_reference-sim_outfiles/denovo_plus_reference-sim.vcf")
sim_vcf_dict["ipyrad-denovo_minus_reference"] = os.path.join(IPYRAD_SIM_DIR, "denovo_minus_reference-sim_outfiles/denovo_minus_reference-sim.vcf")
sim_vcf_dict["stacks"] = os.path.join(STACKS_SIM_DIR, "batch_1.vcf")
sim_vcf_dict["ddocent"] = os.path.join(DDOCENT_SIM_DIR, "TotalRawSNPs.snps.vcf.recode.vcf")

## Make sure we have all the vcf files
for k, f in sim_vcf_dict.items():
    if os.path.exists(f):
        print("found - {}".format(f))
    else:
        print("not found - {}".format(f))

# ### Pull depth and coverage stats out of the vcf files

# Get number of loci per sample, and locus coverage for each vcf file from each assembler and each
# simulated dataset. This is reading in from the vcf files from each assembly method. This is nice
# because vcf is relatively standard and all the tools can give us a version of vcf. It's not perfect
# though because it doesn't include information about monomorphic sites, so it doesn't tell us the
# true number of loci recovered. We can get an idea of coverage and depth at snps, but to get coverage
# and depth stats across all loci we need to dig into the guts of the output of each method (which
# we'll do later).

# In[10]:

import collections

sim_loc_cov = collections.OrderedDict()
sim_snp_cov = collections.OrderedDict()
sim_sample_nsnps = collections.OrderedDict()
sim_sample_nlocs = collections.OrderedDict()

## Try just doing them all the same
for prog, filename in sim_vcf_dict.items():
    try:
        print("Doing - {}".format(prog))
        print(" {}".format(filename))
        v = vcfnp.variants(filename, verbose=False, dtypes={"CHROM":"a24"}).view(np.recarray)
        c = vcfnp.calldata_2d(filename, verbose=False).view(np.recarray)
        sim_snp_cov[prog] = snp_coverage(c)
        sim_sample_nsnps[prog] = sample_nsnps(c)
        sim_loc_cov[prog] = loci_coverage(v, c, prog)
        sim_sample_nlocs[prog] = sample_nloci(v, c, prog)
    except Exception as inst:
        print(inst)
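# As a quick cross-check on the vcf-derived stats gathered above, the number of distinct CHROM
# values in each vcf is a rough proxy for the number of variable loci (meaningful for ipyrad,
# where CHROM encodes the locus, but not for the ddocent reference assembly, where CHROM is just
# the reference contig):

# In[ ]:

for prog, filename in sim_vcf_dict.items():
    if not os.path.exists(filename):
        continue
    v = vcfnp.variants(filename, verbose=False, dtypes={"CHROM":"a24"}).view(np.recarray)
    print("{} - {} snps across {} distinct CHROM values".format(prog, len(v), len(np.unique(v["CHROM"]))))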
# ### Print out the results

# This isn't very helpful, but you get an idea of what's going on. The ddocent results for locus
# coverage and number of loci per sample are ugly because for reference sequence mapping the ddocent
# output vcf doesn't record this information. It only retains CHROM and POS, CHROM being "MT" here
# and POS being the base position of each snp.

# In[11]:

for statname, stat in {"sim_loc_cov":sim_loc_cov, "sim_snp_cov":sim_snp_cov,
                       "sim_sample_nsnps":sim_sample_nsnps, "sim_sample_nlocs":sim_sample_nlocs}.items():
    for prog in sim_vcf_dict.keys():
        try:
            print(prog + " " + statname + "\t"),
            print(stat[prog]),
            print(np.mean(stat[prog]))
        except:
            print("No {} stats for {}".format(statname, prog))
    print("------------------------------------------------------")

# ## Pairwise difference and PCA plots for each simulation treatment

# In[275]:

## Load the calldata into a dict so we don't have to keep loading and reloading it
calldata = {}
for prog in sim_vcf_dict.keys():
    print("{}".format(prog)),
    print("{}".format(sim_vcf_dict[prog]))
    c = vcfnp.calldata_2d(sim_vcf_dict[prog], verbose=False).view(np.recarray)
    calldata[prog] = c

# ### Set sample names and populations for nice plotting

# Some housekeeping with sample names to make the PCA plots prettier

# In[276]:

pop1 = ["1A_0", "1B_0", "1C_0", "1D_0"]
pop2 = ["2E_0", "2F_0", "2G_0", "2H_0"]
pop3 = ["3I_0", "3J_0", "3K_0", "3L_0"]
sim_sample_names = pop1 + pop2 + pop3
pops = {"pop1":pop1, "pop2":pop2, "pop3":pop3}
pop_colors = {"pop1":"r", "pop2":"b", "pop3":"g"}
flt = np.in1d(np.array(sim_sample_names), pop1)
print(flt)

# In[278]:

f, axarr = plt.subplots(2, 2, figsize=(10, 10), dpi=1000)
axarr = [a for b in axarr for a in b]
for prog, ax in zip(assemblers, axarr):
    coords1, model1 = getPCA(calldata[prog])

    x = coords1[:, 0]
    y = coords1[:, 1]
    ax.scatter(x, y, marker='o')
    ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100))
    ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100))

    for pop in pops.keys():
        flt = np.in1d(np.array(sim_sample_names), pops[pop])
        ax.plot(x[flt], y[flt], marker='o', linestyle=' ',
                color=pop_colors[pop], label=pop, markersize=10, mec='k', mew=.5)
    ax.set_title(prog, style="italic")
    ax.axison = True
f.tight_layout()

# ### Plot pairwise distances for each assembler and each simulated datatype

# In[279]:

f, axarr = plt.subplots(2, 2, figsize=(10, 10), dpi=1000)
axarr = [a for b in axarr for a in b]
for prog, ax in zip(assemblers, axarr):
    ## Calculate pairwise distances
    dist = getDistances(calldata[prog])

    ## Doing it this way works, but allel uses imshow internally which rasterizes the image
    #allel.plot.pairwise_distance(dist, labels=None, ax=ax, colorbar=False)

    ## Create the pcolormesh by hand
    dat = ensure_square(dist)
    ## for some reason np.flipud(dat) is chopping off one row of data
    p = ax.pcolormesh(np.arange(0, len(dat[0])), np.arange(0, len(dat[0])), dat,
                      cmap="jet", vmin=np.min(dist), vmax=np.max(dist))
    ## Clip all heatmaps to actual sample size
    p.axes.axis("tight")
    ax.set_title(prog, style="italic")
    ax.axison = False

# # Go through and pull in more fine grained results

# What we want to know is total number of loci recovered (not just variable loci). First we'll create
# some dictionaries just like above, but we'll call them *_full_* to indicate that they include
# monomorphic sites. Because snps don't occur in monomorphic sites kind of by definition, we're only
# really interested in the depth across loci and the number of loci recovered per sample.
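# To be concrete about the bookkeeping: each entry of the *_full_* loc_cov dicts is a list where
# position i-1 holds the number of loci recovered in exactly i samples, and each sample_nlocs entry
# is a list of per-sample locus counts. Toy values for illustration only:

# In[ ]:

toy_loc_cov = [0, 3, 25, 972]              ## e.g. 972 loci present in all 4 (toy) samples
toy_sample_nlocs = [1000, 998, 1000, 997]  ## loci recovered per sample
print("loci shared by all samples - {}".format(toy_loc_cov[-1]))
print("mean loci per sample - {}".format(np.mean(toy_sample_nlocs)))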
# In[9]:

## Blank the ordered dicts for gathering locus coverage and sample nlocs
sim_full_loc_cov = collections.OrderedDict()
sim_full_sample_nlocs = collections.OrderedDict()

# ## ipyrad simulated results

# In[13]:

assembly_methods = {"ipyrad-reference":"refmap-sim", "ipyrad-denovo_reference":"denovo_ref-sim"}
for name, method in assembly_methods.items():
    print(method)
    simdir = os.path.join(IPYRAD_SIM_DIR, method + "_outfiles/")
    statsfile = simdir + "{}_stats.txt".format(method)
    infile = open(statsfile).readlines()
    sample_coverage = [int(x.strip().split()[1]) for x in infile[20:32]]
    print(sample_coverage)
    print("mean sample coverage - {}\t".format(np.mean(sample_coverage))),
    print("min/max - {}/{}\t".format(np.min(sample_coverage), np.max(sample_coverage)))
    sim_full_sample_nlocs[name] = sample_coverage
    nmissing = [int(x.strip().split()[1]) for x in infile[38:50]]
    sim_full_loc_cov[name] = nmissing

## Just look at the ones we care about for ipyrad
print(sim_full_loc_cov.items())
print(sim_full_sample_nlocs.items())

# ## stacks simulated results

# In[14]:

method = "stacks"
simdir = STACKS_SIM_DIR
try:
    lines = open("{}/batch_1.haplotypes.tsv".format(simdir)).readlines()
    cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]]
    sim_full_loc_cov[method] = [cnts.count(i) for i in range(1, 13)]
except Exception as inst:
    print("loc_cov - {} - {}".format(inst, simdir))

try:
    sim_full_sample_nlocs[method] = []
    samp_haps = glob.glob("{}/*matches*".format(simdir))
    for f in samp_haps:
        lines = gzip.open(f).readlines()
        sim_full_sample_nlocs[method].append(len(lines) - 1)
except Exception as inst:
    print("sample_nlocs - {} - {}".format(inst, simdir))

print(sim_full_loc_cov.items())
print(sim_full_sample_nlocs.items())

# ## dDocent simulated results

# It is not straightforward to get locus coverage and nloci per sample information from ddocent
# results. See the manuscript-analysis notebook for more explanation about why.

# In[10]:

DDOCENT_DIR = "/home/iovercast/manuscript-analysis/dDocent/"

def ddocent_stats(RUNDIR):
    ## This is hackish. You have to dip into the bam file to get locus counts that
    ## include counts for monomorphics. Note! that field 3 of the bam file is not
    ## the sequence data, but rather the dDocent mock-contig ('dDocent_Contig_3')
    ## the read maps to. So really this is kind of by proxy counting the number of loci.
    sample_coverage = []
    for samp in glob.glob(RUNDIR + "/*-RG.bam"):
        cmd = "{}samtools view {} | cut -f 4 | uniq | wc -l".format(DDOCENT_DIR, samp)
        res = subprocess.check_output(cmd, shell=True)
        sample_coverage.append(int(res.strip()))
    print("This must be taken with a grain of salt because these aren't the actual counts you get out of the final data files.")
    print("mean sample coverage - {}".format(np.mean(sample_coverage)))
    print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))

    cmd = "{}samtools view {}cat-RRG.bam | cut -f 3,4 | uniq".format(DDOCENT_DIR, RUNDIR)
    locus_positions = subprocess.check_output(cmd, shell=True).split("\n")

    locus_counter = collections.Counter()     # Num loci per sample
    coverage_counter = collections.Counter()  # Coverage per locus
    for loc in locus_positions:
        if not loc:
            continue
        chrom = loc.split()[0]
        pos = int(loc.split()[1])
        cmd = "{}samtools view {}cat-RRG.bam {}:{}-{} | cut -f 1 | cut -f 3 -d \"_\" | sort | uniq".format(DDOCENT_DIR, RUNDIR, chrom, pos, pos+1)
        res = subprocess.check_output(cmd, shell=True).split()
        locus_counter.update(res)
        coverage_counter.update([len(res)])

    ## Fill in zero values that aren't in the coverage counter
    dat = []
    for i in xrange(1, 13):
        try:
            dat.append(coverage_counter[i])
        except:
            dat.append(0)
    return dat, locus_counter.values()

loc_cov, sample_nlocs = ddocent_stats(DDOCENT_SIM_DIR)
sim_full_loc_cov["ddocent"] = loc_cov
sim_full_sample_nlocs["ddocent"] = sample_nlocs
print(sim_full_loc_cov.items())
print(sim_full_sample_nlocs.items())

# ## Plotting simulation results

# In[313]:

simlevel = ["simulated"]
sim_sample_nlocs_df = pd.DataFrame(index=assemblers, columns=simlevel)
for sim in simlevel:
    for ass in assemblers:
        simstring = ass
        sim_sample_nlocs_df[sim][ass] = np.mean(sim_full_sample_nlocs[simstring])

print("Mean number of loci recovered per sample.")
## Normalize all bins
dat = sim_sample_nlocs_df[sim_sample_nlocs_df.columns].astype(float)
for sim in simlevel:
    scale = 1000  # if "ipyrad-reference" == sim
    dat[sim] = dat[sim]/scale
    dat[sim]["ipyrad-reference"] *= 2
sns.heatmap(dat, square=True, center=1, linewidths=2, annot=True)
print(sim_sample_nlocs_df)

# ### Much better! Do the spline plot version of the above plot

# It just looks nicer, makes more sense.
#
# __NB__: This is broken for refmap.

# In[60]:

dfsns2 = pd.DataFrame(columns=["assembler", "simtype", "bin", "simdata"])
for sim in simlevel:
    for ass in assemblers:
        simstring = ass
        ## Normalize values so different sim sizes print the same
        max = 1000.
        newdat = sim_full_loc_cov[simstring]
        newdat = [sum(newdat)-sum(newdat[:i-1]) for i in range(1, 13)]
        for i, val in enumerate(newdat):
            dfsns2.loc[simstring + "-" + str(i)] = [ass, sim, i+1, val]
g = sns.FacetGrid(dfsns2, col="simtype", hue="assembler", size=1)
g.map(plt.scatter, "bin", "simdata")
g.map(plt.plot, "bin", "simdata").add_legend()
#axs = g.axes
#for ax in axs:
#    ax.set_xlim(0, 12.5)

# # Empirical Results (Phocoena)

# This section is incomplete. See below for empirical and simulated results that are combined.

# ## Process the vcf output from all the runs

# Here we'll pull together all the output vcf files. This takes a few minutes to run.
# In[123]:

emp_vcf_dict = {}
emp_vcf_dict["ipyrad-reference"] = os.path.join(IPYRAD_REFMAP_DIR, "refmap-empirical_outfiles/refmap-empirical.vcf")
emp_vcf_dict["ipyrad-denovo_reference"] = os.path.join(IPYRAD_REFMAP_DIR, "denovo_ref-empirical_outfiles/denovo_ref-empirical.vcf")
emp_vcf_dict["stacks"] = os.path.join(STACKS_REFMAP_DIR, "batch_1.vcf")
emp_vcf_dict["ddocent"] = os.path.join(DDOCENT_REFMAP_DIR, "TotalRawSNPs.snps.vcf.recode.vcf")

# ## Read in vcf for each analysis and pull in coverage/depth stats

# In[124]:

emp_loc_cov = collections.OrderedDict()
emp_snp_cov = collections.OrderedDict()
emp_sample_nsnps = collections.OrderedDict()
emp_sample_nlocs = collections.OrderedDict()

## Try just doing them all the same
for prog, filename in emp_vcf_dict.items():
    try:
        print("Doing - {}".format(prog))
        print("\t{}".format(filename))
        v = vcfnp.variants(filename, dtypes={"CHROM":"a24"}).view(np.recarray)
        c = vcfnp.calldata_2d(filename).view(np.recarray)
        emp_loc_cov[prog] = loci_coverage(v, c, prog)
        emp_snp_cov[prog] = snp_coverage(c)
        emp_sample_nsnps[prog] = sample_nsnps(c)
        emp_sample_nlocs[prog] = sample_nloci(v, c, prog)
        plotPCA(c, prog)
        plotPairwiseDistance(c, prog)
    except Exception as inst:
        print(inst)

# In[608]:

for i in emp_sample_nlocs:
    print(i),
    print(emp_sample_nlocs[i])

# ## ipyrad Empirical results

# First we load in the variant info and call data for all the snps

# In[70]:

IPYRAD_EMPIRICAL_OUTPUT = os.path.join(IPYRAD_DIR, "REALDATA/")
IPYRAD_STATS = os.path.join(IPYRAD_EMPIRICAL_OUTPUT, "REALDATA_outfiles/REALDATA_stats.txt")
infile = open(IPYRAD_STATS).readlines()
sample_coverage = [int(x.strip().split()[1]) for x in infile[20:33]]
print(sample_coverage)
print("mean sample coverage - {}".format(np.mean(sample_coverage)))
print("min/max - {}/{}".format(np.min(sample_coverage), np.max(sample_coverage)))

# Read the biallelic vcf

# In[71]:

filename = os.path.join(IPYRAD_EMPIRICAL_OUTPUT, "REALDATA_outfiles/REALDATA.biallelic.vcf")
# filename = vcf_dict["ipyrad"]
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)

# ### Distribution of snps along loci

# Getting variable sites and parsimony informative sites from the vcf is kind of annoying
# because all the programs export __slightly__ different formats, so you need to
# parse them in slightly different ways. There's a better way to do this for ipyrad
# but i figure i'll do it the same way for all of them so it's more clear what's happening.
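# The parsimony-informative filter below boils down to: a snp is informative when the alternate
# allele shows up in at least two samples. A toy run of the masking logic (made-up genotypes,
# not from any assembly):

# In[ ]:

toy_geno = np.array([[[0, 0], [0, 1], [0, 1]],    ## alt in 2 samples -> informative
                     [[0, 0], [0, 0], [0, 1]]])   ## alt in 1 sample  -> not informative
toy_pos = np.array([5, 9])
toy_alt = map(lambda x: map(lambda y: 1 in y, x), toy_geno)
toy_counts = map(lambda x: np.count_nonzero(x), toy_alt)
toy_mask = map(lambda x: x < 2, toy_counts)               ## True = masked out
print(np.ma.array(toy_pos, mask=toy_mask).compressed())   ## [5]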
# In[72]:

## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a mask that is True (i.e. masked out) for snps that are NOT informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["POS"]), mask=only_pis).compressed()

## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x) for x in pis])
#distpis = [x for x in sorted(counts.items())]

## Getting the distvar is easier
distvar = Counter([int(x) for x in v.POS])
#distvar = [x for x in sorted(counts.items())]

canvas, axes = SNP_position_plot("ipyrad", distvar, distpis)

## save fig
#toyplot.html.render(canvas, 'snp_positions.html')

canvas

# ## stacks empirical results

# In[12]:

STACKS_OUTPUT = os.path.join(STACKS_DIR, "REALDATA/")
STACKS_GAP_OUT = os.path.join(STACKS_OUTPUT, "gapped/")
STACKS_UNGAP_OUT = os.path.join(STACKS_OUTPUT, "ungapped/")
STACKS_DEFAULT_OUT = os.path.join(STACKS_OUTPUT, "default/")

#lines = open("SIMsmall/stackf_high/batch_1.haplotypes.tsv").readlines()
#cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]]
#shigh = [cnts.count(i) for i in range(1,13)]

# In[13]:

filename = os.path.join(STACKS_UNGAP_OUT, "batch_1.vcf")
v = vcfnp.variants(filename).view(np.recarray)
c = vcfnp.calldata_2d(filename).view(np.recarray)

# ### Distribution of snps along loci

# In[14]:

## Get only parsimony informative sites
## Get T/F values for whether each genotype is ref or alt across all samples/loci
is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"])
## Count the number of alt alleles per snp (we only want to retain when #alt > 1)
alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele)
## Create a mask that is True (i.e. masked out) for snps that are NOT informative
only_pis = map(lambda x: x < 2, alt_counts)
## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus
## Also, compress() the masked array so we only actually see the pis
pis = np.ma.array(np.array(v["ID"]), mask=only_pis).compressed()

## Now have to massage this into the list of counts per site in increasing order
## of position across the locus
distpis = Counter([int(x.split("_")[1]) for x in pis])
#distpis = [x for x in sorted(counts.items())]

## Getting the distvar is easier
distvar = Counter([int(x.split("_")[1]) for x in v.ID])
#distvar = [x for x in sorted(counts.items())]

canvas, axes = SNP_position_plot("stacks", distvar, distpis)

## save fig
#toyplot.html.render(canvas, 'snp_positions.html')

canvas

# # SKIP to HERE

# # Do both simulated and empirical at the same time

# Could make the plots prettier

# In[349]:

vcf_dict = {}
vcf_dict["ipyrad-reference-sim"] = os.path.join(IPYRAD_SIM_DIR, "refmap-sim_outfiles/refmap-sim.vcf")
vcf_dict["ipyrad-denovo_plus_reference-sim"] = os.path.join(IPYRAD_SIM_DIR, "denovo_plus_reference-sim_outfiles/denovo_plus_reference-sim.vcf")
vcf_dict["ipyrad-denovo_minus_reference-sim"] = os.path.join(IPYRAD_SIM_DIR, "denovo_minus_reference-sim_outfiles/denovo_minus_reference-sim.vcf")
vcf_dict["stacks-sim"] = os.path.join(STACKS_SIM_DIR, "batch_1.vcf")
vcf_dict["ddocent-tot-sim"] = os.path.join(DDOCENT_SIM_DIR, "TotalRawSNPs.snps.vcf.recode.vcf")
vcf_dict["ddocent-fin-sim"] = os.path.join(DDOCENT_SIM_DIR, "Final.recode.snps.vcf.recode.vcf")
vcf_dict["ipyrad-reference-empirical"] = os.path.join(IPYRAD_REFMAP_DIR, "reference-assembly/refmap-empirical_outfiles/refmap-empirical.vcf")
vcf_dict["ipyrad-denovo_plus_reference-empirical"] = os.path.join(IPYRAD_REFMAP_DIR, "reference-assembly/denovo_plus_reference_outfiles/denovo_ref-empirical.vcf")
vcf_dict["ipyrad-denovo_plus_reference-095-empirical"] = os.path.join(IPYRAD_REFMAP_DIR, "reference-assembly/denovo_plus_reference-095_outfiles/denovo_plus_reference-095.vcf")
vcf_dict["ipyrad-denovo_minus_reference-empirical"] = os.path.join(IPYRAD_REFMAP_DIR, "reference-assembly/denovo_minus_reference_outfiles/denovo_ref-empirical.vcf")
vcf_dict["stacks-empirical"] = os.path.join(STACKS_REFMAP_DIR, "batch_1.vcf")
vcf_dict["ddocent-fin-empirical"] = os.path.join(DDOCENT_REFMAP_DIR, "Final.recode.snps.vcf.recode.vcf")

## skip the full ddocent vcf. It's huge and we know we don't want to use it.
#vcf_dict["ddocent-tot-empirical"] = os.path.join(DDOCENT_REFMAP_DIR, "TotalRawSNPs.snps.vcf.recode.vcf")

## Make sure we have all the vcf files
for k, f in vcf_dict.items():
    if os.path.exists(f):
        print("found - {}".format(f))
    else:
        print("not found - {}".format(f))

# ### Some magic to get samples and populations properly grouped and colored for pretty PCA plots

# There has got to be a better way to do this for the empirical at least. It seems super hax, but it works.

# In[56]:

## sim
pop1 = ["1A_0", "1B_0", "1C_0", "1D_0"]
pop2 = ["2E_0", "2F_0", "2G_0", "2H_0"]
pop3 = ["3I_0", "3J_0", "3K_0", "3L_0"]
sim_sample_names = pop1 + pop2 + pop3
sim_pops = {"pop1":pop1, "pop2":pop2, "pop3":pop3}
sim_pop_colors = {"pop1":"r", "pop2":"b", "pop3":"g"}

## empirical
emp_pops = {}
emp_sample_names = []
popfile = os.path.join(STACKS_REFMAP_DIR, "popmap.txt")
with open(popfile) as infile:
    lines = [l.strip().split() for l in infile.readlines()]
emp_sample_names = [x[0] for x in lines]
pops = set([x[1] for x in lines])
for pop in pops:
    emp_pops[pop] = []
for line in lines:
    p = line[1]
    s = line[0]
    emp_pops[p].append(s)
emp_pop_colors = {k:v for (k, v) in zip(emp_pops, list(matplotlib.colors.cnames))}

## Write out the samples to pop files for vcftools
for outdir, pop_dict in {REFMAP_SIM_DIR:sim_pops, REFMAP_EMPIRICAL_DIR:emp_pops}.items():
    for pop, samps in pop_dict.items():
        with open(outdir + pop + ".txt", "w") as outfile:
            outfile.write("\n".join(samps))

# In[351]:

## The order of the samples in the vcf file is wonky. Reorder them so they're sorted by population. This
## is roughly the order of populations from the manuscript.
## If you run this multiple times, you should re-run the previous cell that resets
## the vcf file to the original path, otherwise you get a bunch of *.reorder.reorder.vcf files.
order = ["WBS", "IS", "NOS", "SK1", "KB1", "BES2", "IBS"] vcf_header = """##fileformat=VCFv4.1\n##fileDate=20161230\n##source="Stacks v1.42"\n##INFO=\n##INFO=\n##FORMAT=\n##FORMAT=\n##FORMAT=\n##FORMAT=\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT""" with open("/tmp/dummy.vcf", 'w') as outfile: outfile.write(vcf_header) for p in order: outfile.write("\t" + "\t".join(emp_pops[p])) for k, f in vcf_dict.items(): if "stacks" in k: ## The version of vcftools we have here doesn't like the newer version of vcf stacks ## writes, so we have to 'fix' the stacks vcf version print("Fixing stacks vcf file format - {}".format(k)) shutil.copy2(f, f+".bak") with open(f) as infile: dat = infile.readlines()[1:] with open(f, 'w') as outfile: outfile.write("##fileformat=VCFv4.1\n") outfile.write("".join(dat)) if "empirical" in k: print("reordering") vcftools_path = os.path.join(DDOCENT_DIR, "vcftools_0.1.11/perl/") os.chdir(vcftools_path) tmpvcf = "tmp.vcf" newvcf = f.rsplit(".", 1)[0] + ".reordered.vcf" print(newvcf) cmd = "perl vcf-shuffle-cols -t {} {} > {}".format("/tmp/dummy.vcf", f, tmpvcf) print(cmd) os.system(cmd) get_ipython().system('mv $tmpvcf $newvcf') ## Don't destroy the old one vcf_dict[k] = newvcf # In[352]: import collections ## Load the calldata into a dict so we don't have to keep loading and reloading it ## This can take several minutes for the large empirical datasets (like 20+ minutes) all_calldata = {} all_vardata = {} ## Dicts for tracking all the stats loc_cov = collections.OrderedDict() snp_cov = collections.OrderedDict() samp_nsnps = collections.OrderedDict() samp_nlocs = collections.OrderedDict() for prog, filename in vcf_dict.items(): try: print("Doing - {}\n {}".format(prog, filename)) v = vcfnp.variants(filename, verbose=False, dtypes={"CHROM":"a24"}).view(np.recarray) c = vcfnp.calldata_2d(filename, verbose=False).view(np.recarray) all_calldata[prog] = c all_vardata[prog] = v snp_cov[prog] = snp_coverage(c) samp_nsnps[prog] = sample_nsnps(c) loc_cov[prog] = loci_coverage(v, c, prog) samp_nlocs[prog] = sample_nloci(v, c, prog) except Exception as inst: print(inst) # In[353]: for statname, stat in {"loc_cov":loc_cov, "snp_cov":snp_cov,\ "sample_nsnps":samp_nsnps, "sample_nlocs":samp_nlocs}.items(): for prog in vcf_dict.keys(): try: print(prog + " " + statname + "\t"), print(stat[prog]), print(np.mean(stat[prog])) except: print("No {} stats for {}".format(statname, prog)) print("------------------------------------------------------") # ## Filter linked snps? Doesn't make a difference. # In[354]: f, axarr = plt.subplots(2, 5, figsize=(20,8), dpi=1000) ## Make a list out of the axes axarr = [a for b in axarr for a in b] ## Set them in order so the plot looks nice. 
progs = ["ipyrad-reference-sim", "ipyrad-denovo_plus_reference-sim", "ipyrad-denovo_minus_reference-sim", "stacks-sim", "ddocent-fin-sim",\ "ipyrad-reference-empirical", "ipyrad-denovo_plus_reference-095-empirical", "ipyrad-denovo_minus_reference-empirical", "stacks-empirical", "ddocent-fin-empirical"] for prog, ax in zip(progs, axarr): print(prog, ax) if "empirical" in prog: pop_colors = emp_pop_colors pops = emp_pops sample_names = emp_sample_names else: pop_colors = sim_pop_colors pops = sim_pops sample_names = sim_sample_names ## Don't die if some of the runs aren't complete try: coords1, model1 = getPCA(all_calldata[prog]) except: continue x = coords1[:, 0] y = coords1[:, 1] ax.scatter(x, y, marker='o') ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100)) ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100)) for pop in pops.keys(): flt = np.in1d(np.array(sample_names), pops[pop]) ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=pop_colors[pop], label=pop, markersize=10, mec='k', mew=.5) ax.set_title(prog, style="italic") ax.axison = True axarr[0].legend(frameon=True) axarr[5].legend(loc='lower left', frameon=True) f.tight_layout() # In[355]: f, axarr = plt.subplots(2, 5, figsize=(20,8), dpi=1000) axarr = [a for b in axarr for a in b] ## Set them in order so the plot looks nice. progs = ["ipyrad-reference-sim", "ipyrad-denovo_plus_reference-sim", "ipyrad-denovo_minus_reference-sim", "stacks-sim", "ddocent-fin-sim",\ "ipyrad-reference-empirical", "ipyrad-denovo_plus_reference-095-empirical", "ipyrad-denovo_minus_reference-empirical", "stacks-empirical", "ddocent-fin-empirical"] for prog, ax in zip(progs, axarr): print(prog, ax) try: ## Calculate pairwise distances dist = getDistances(all_calldata[prog]) except: continue ## Create the pcolormesh by hand dat = ensure_square(dist) ## for some reason np.flipud(dat) is chopping off one row of data p = ax.pcolormesh(np.arange(0,len(dat[0])), np.arange(0,len(dat[0])), dat,\ cmap="jet", vmin=np.min(dist), vmax=np.max(dist)) ## Clip all heatmaps to actual sample size p.axes.axis("tight") ax.set_title(prog, style="italic") ax.axison = False ## Adjust margins to make room for the colorbar plt.subplots_adjust(left=0.05, bottom=0.1, right=0.8, top=0.9, wspace=0.2, hspace=0.3) ## Add the colorbar cax = f.add_axes([0.87, 0.4, 0.03, 0.5]) cb1 = matplotlib.colorbar.ColorbarBase(cax, cmap="jet", orientation="vertical", ticks=([0,1])) cb1.ax.set_yticklabels(['Similar', "Dissimilar"], weight="bold", rotation="vertical") # ## Pull in finer grained stats for each assembler # For ipyrad and stacks this is very quick, but the whole step is slow because ddocent empirical # stats take _forever_ to obtain. # In[357]: ## Blank the ordered dicts for gathering locus coverage and sample nlocs all_full_loc_cov = collections.OrderedDict() all_full_sample_nlocs = collections.OrderedDict() ## Mapping between 'pretty names' and directory names. This is only really useful for ipyrad ## where i named the directories a little silly. For stacks and ddocent this doesn't really do anything. 
assembly_methods = {"ipyrad-reference-sim":"refmap-sim", "ipyrad-denovo_reference-sim":"denovo_ref-sim",\ "stacks-sim":"stacks-sim", "ipyrad-reference-empirical":"refmap-empirical",\ "stacks-empirical":"stacks-empirical", "ddocent-simulated":"ddocent-simulated",\ "ddocent-empirical":"ddocent-empirical",\ "ipyrad_denovo_minus_reference-sim":"denovo_minus_reference-sim",\ "ipyrad_denovo_plus_reference-sim":"denovo_plus_reference-sim",\ "ipyrad_denovo_minus_reference-empirical":"denovo_minus_reference",\ "ipyrad_denovo_plus_reference-empirical":"denovo_plus_reference-095" } for name, method in assembly_methods.items(): print("Doing - {}".format(name)) if "ipyrad" in name: outdir = IPYRAD_SIM_DIR firstsamp = 20 lastsamp = 32 if "empirical" in name: outdir = IPYRAD_REFMAP_DIR + "reference-assembly/" firstsamp = 20 #fixme lastsamp = 64 # fixme try: nsamps = lastsamp - firstsamp simdir = os.path.join(outdir, method + "_outfiles/") statsfile = simdir + "{}_stats.txt".format(method) infile = open(statsfile).readlines() sample_coverage = [int(x.strip().split()[1]) for x in infile[firstsamp:lastsamp]] print("mean sample coverage - {}\t".format(np.mean(sample_coverage))), print("min/max - {}/{}\t".format(np.min(sample_coverage), np.max(sample_coverage))) all_full_sample_nlocs[name] = sample_coverage nmissing = [int(x.strip().split()[1]) for x in infile[lastsamp+6:lastsamp+6+nsamps]] all_full_loc_cov[name] = nmissing except Exception as inst: print(inst) elif "stacks" in name: outdir = STACKS_SIM_DIR nsamps = 12 if "empirical" in name: outdir = STACKS_REFMAP_DIR nsamps = 44 try: ## Effectively the same as thisjjjj ## cut -f 2 batch_1.haplotypes.tsv | sort -n | uniq -c | less lines = open("{}/batch_1.haplotypes.tsv".format(outdir)).readlines() cnts = [int(field.strip().split("\t")[1]) for field in lines[1:]] all_full_loc_cov[method] = [cnts.count(i) for i in range(1,nsamps+1)] except Exception as inst: print("loc_cov - {} - {}".format(inst, simdir)) try: all_full_sample_nlocs[method] = [] ## This is actually number of loci ## cut -f 3 *matches | sort -n | uniq | wc -l ## Right now this is actually counting number of snps samp_haps = glob.glob("{}/*matches*".format(outdir)) for f in samp_haps: lines = gzip.open(f).readlines() all_full_sample_nlocs[method].append(len(lines) - 1) except Exception as inst: print("sample_nlocs - {} - {}".format(inst, outdir)) # elif "ddocent" in name: # outdir = DDOCENT_SIM_DIR # if "empirical" in name: # outdir = DDOCENT_REFMAP_DIR # loc_cov, sample_nlocs = ddocent_stats(outdir) # all_full_loc_cov[method] = loc_cov # all_full_sample_nlocs[method] = sample_nlocs else: print("wtf?") try: print(all_full_loc_cov[name]) print(all_full_sample_nlocs[name]) except: print("no data for - {}".format(name)) pass print("Done - {}\n\n".format(name)) print(all_full_loc_cov.items()) print(all_full_sample_nlocs.items()) # ## Get pairwise fst for each population for simulated and empirical # Using vcftools just to get the mean fst between each population pair. # In[359]: import itertools ## dict for storing fst values. This is a dict of dictionaries. The top level is for ## each assembly type and within each assembly type is a dict of pop pairs and the fst between them. 
# In[359]:

import itertools

## dict for storing fst values. This is a dict of dictionaries. The top level is keyed by
## assembly type, and within each assembly type is a dict of pop pairs and the fst between them.
fsts = {}

## Get pairwise fst for each pop for sim and empirical
d = {REFMAP_SIM_DIR:sim_pops, REFMAP_EMPIRICAL_DIR:emp_pops}
for k, v in vcf_dict.items():
    if not os.path.exists(v):
        continue
    if "sim" in k:
        indir = REFMAP_SIM_DIR
        pop_dict = sim_pops
    else:
        indir = REFMAP_EMPIRICAL_DIR
        pop_dict = emp_pops
    os.chdir(indir)
    try:
        print("Doing - {}".format(k))
        fsts[k] = {}
        combinations = list(itertools.combinations(pop_dict, 2))
        for pair in combinations:
            pop1 = indir + pair[0] + ".txt"
            pop2 = indir + pair[1] + ".txt"
            vcftools_cmd = DDOCENT_DIR + "vcftools"
            cmd = "{} --vcf {} --weir-fst-pop {} --weir-fst-pop {} --out {}".format(vcftools_cmd, v, pop1, pop2, k)
            ret = subprocess.check_output(cmd, shell=True)
            ret = ret.split("\n")
            ## Get the mean fst from the output
            fst = [x for x in ret if "Weir and Cockerham mean" in x][0].split(": ")[1]
            weighted_fst = [x for x in ret if "Weir and Cockerham weighted" in x][0].split(": ")[1]
            print("-".join(pair), "{}/{}".format(fst, weighted_fst)),
            fsts[k]["-".join(pair)] = "{}/{}".format(fst, weighted_fst)
        print("\n")
    except Exception as inst:
        print("Failed - {} - {}".format(k, inst))
        ## Allow it to fail if the vcf doesn't exist.
        pass

# In[360]:

from IPython.display import display
import pandas as pd

## Organize and pretty print the fsts dict
df_sim = pd.DataFrame(index=sim_pops.keys(), columns=sim_pops.keys(), dtype=str).fillna("")
df_emp = pd.DataFrame(index=emp_pops.keys(), columns=emp_pops.keys(), dtype=str).fillna("")
for assembler, all_data in fsts.items():
    print("{}".format(assembler))
    if "sim" in assembler:
        indir = REFMAP_SIM_DIR
        pop_dict = sim_pops
        df = df_sim
    else:
        indir = REFMAP_EMPIRICAL_DIR
        pop_dict = emp_pops
        df = df_emp
    ## Init the diagonal
    for p in pop_dict.keys():
        df[p][p] = ""
    for p, fst_data in all_data.items():
        #print(df)
        p1 = p.split("-")[0]
        p2 = p.split("-")[1]
        df[p1][p2] += " "+fst_data.split("/")[0][:6]
        df[p2][p1] += " "+fst_data.split("/")[1][:6]

pd.set_option('display.max_colwidth', 800)
pd.set_option('display.width', 80)
df_sim.style.set_properties(**{'max-width':'10px'})
display(df_sim)
display(df_emp)

# ## Quick and dirty raxml trees

# ipyrad and stacks both kindly write out phylip files, but ddocent doesn't, so we'll have to make one ourselves.

# In[177]:

get_ipython().run_cell_magic('bash', '-s "$WORK_DIR"', '## Install raxml\nmkdir $1/miniconda/src\ncd $1/miniconda/src\ngit clone https://github.com/stamatak/standard-RAxML.git\ncd standard-RAxML\nmake -f Makefile.PTHREADS.gcc\ncp raxmlHPC-PTHREADS $1/miniconda/bin\n')

# In[211]:

## Get this script to convert from vcf to phy
## This script wants python 3, so I had to go in and add the future import for
## print by hand. Add this on line 17:
##
##   from __future__ import print_function
##
## Also, had to update lines 32-34 like so:
##
##   iupac = {"AG": "R", "CT": "Y", "CG": "S", "AT": "W", "GT": "K", "AC": "M",
##            "CGT": "B", "AGT": "D", "ACT": "H", "ACG": "V", "ACGT": "N", "AA": "A",
##            "CC": "C", "GG": "G", "TT": "T", "GN": "N", "CN": "N", "AN": "N", "TN": "N",
##            "NT": "N", "NC": "N", "NG": "N", "NA": "N", "NN": "N"}
##
## Also, when we decompose the ddocent complex genotypes vcftools splits ./.
## data as '.', which VCF2phy.py kinda hates, so you have to add this piece
## of code at lines 75/76:
##
##   if genotype_string == ".":
##       return "N"

#!wget https://raw.githubusercontent.com/CoBiG2/RAD_Tools/master/VCF2phy.py

for k, v in vcf_dict.items():
    if "stacks" in k and "sim" in k:
        outphy = v.rsplit(".", 1)[0]
        print(outphy)
        cmd = "python VCF2phy.py -vcf {} -o {}".format(v, outphy)
        ret = subprocess.check_output(cmd, shell=True)
        print(ret)
    if "ddocent" in k:
        ## Skip doing the big boy for now
        if "empirical" in k and "Total" in v:
            continue
        ## Do not add .phy to outphy, the python script does it for us.
        outphy = v.rsplit(".", 1)[0]
        print(outphy)
        cmd = "python VCF2phy.py -vcf {} -o {}".format(v, outphy)
        ret = subprocess.check_output(cmd, shell=True)
        print(ret)

# ## Run raxml

# For simulated datasets raxml runs very fast (<20 seconds per phylip file).
#
# For empirical it's much slower, maybe 1-2 days per assembler. Be sure to use the '.snps.phy' ipyrad file
# or else raxml runs _forever_.

# In[361]:

for d in [REFMAP_SIM_DIR, REFMAP_EMPIRICAL_DIR]:
    raxoutdir = d + "raxml_outdir"
    if not os.path.exists(raxoutdir):
        os.mkdir(raxoutdir)

emp_trees = {}
sim_trees = {}
for assembler, vcffile in vcf_dict.items():
    if "sim" in assembler:
        outdir = REFMAP_SIM_DIR + "raxml_outdir"
        ncores = 2
        out_trees = sim_trees
        physplit = 1
    else:
        outdir = REFMAP_EMPIRICAL_DIR + "raxml_outdir"
        ncores = 20
        out_trees = emp_trees
        physplit = 2
    if "ipyrad" in assembler:
        inputfile = vcffile.rsplit(".", physplit)[0] + ".snps.phy"
    if "stacks" in assembler:
        inputfile = vcffile.rsplit(".", 2)[0] + ".phylip"
    if "ddocent" in assembler:
        inputfile = vcffile.rsplit(".", 1)[0] + ".phy"
    if not os.path.exists(inputfile):
        print("\nCan't find .phy or .phylip for - {}\n".format(inputfile.rsplit(".", 1)[0]))

    cmd = "{}raxmlHPC-PTHREADS -f a ".format(WORK_DIR + "/miniconda/bin/") \
          + " -T {} ".format(ncores) \
          + " -m GTRGAMMA " \
          + " -N 100 " \
          + " -x 12345 " \
          + " -p 54321 " \
          + " -n {} ".format(assembler) \
          + " -w {} ".format(outdir) \
          + " -s {}".format(inputfile)
    print(cmd)

    ## RAxML_bestTree is the ML tree without support values; RAxML_bipartitions is the
    ## same topology with bootstrap support drawn on, which is what we want to plot.
    ## out_trees[assembler] = "{}/RAxML_bestTree.{}".format(outdir, assembler)
    out_trees[assembler] = "{}/RAxML_bipartitions.{}".format(outdir, assembler)
    #continue
    if "sim" in assembler:
        print(assembler)
        #!time $cmd
    if "empirical" in assembler:
        print(assembler)
        #!time $cmd

# In[368]:

#!conda install biopython
from Bio import Phylo

f, axarr = plt.subplots(1, 5, figsize=(20, 8), dpi=1000)
## For more than 1 row uncomment this
#axarr = [a for b in axarr for a in b]

## Pop membership and colors per pop were established above during the PCA
#sim_sample_names = pop1 + pop2 + pop3
#sim_pops = {"pop1":pop1, "pop2":pop2, "pop3":pop3}
#sim_pop_colors = {"pop1":"r", "pop2":"b", "pop3":"g"}
sim_colors_per_sample = {}
for samp in sim_sample_names:
    for pop in sim_pops:
        if samp in sim_pops[pop]:
            sim_colors_per_sample[samp] = sim_pop_colors[pop]

## Set them in order so the plot looks nice.
#progs = ["ipyrad-reference-sim", "ipyrad-denovo_plus_reference-sim", "ipyrad-denovo_minus_reference-sim", "stacks-sim", "ddocent-fin-sim",\ # "ipyrad-reference-empirical", "ipyrad-denovo_reference-empirical", "ipyrad-denovo_minus_reference-empirical", "stacks-empirical", "ddocent-fin-empirical"] outgroup = [{'name': taxon_name} for taxon_name in pop3] for assembler, ax in zip(sim_trees.keys(), axarr): print(assembler, sim_trees[assembler], ax) tree = Phylo.read(sim_trees[assembler], 'newick') tree.ladderize() tree.root_with_outgroup(*outgroup) ## This could be cool but doesn't work Phylo.draw(tree, axes = ax, do_show=False, label_colors=sim_colors_per_sample) ax.set_title(assembler) plt.tight_layout() # # TESTING - Everything below here is crap # In[367]: del sim_trees['ddocent-tot-sim'] print(len(sim_trees)) print(sim_trees) # In[ ]: ## Set empirical colors emp_colors_per_sample = {} for samp in emp_sample_names: for pop in emp_pops: if samp in emp_pops[pop]: emp_colors_per_sample[samp] = emp_pop_colors[pop] # In[ ]: import ete3 # In[ ]: for prog in all_calldata.keys(): #for prog in ["ipyrad-reference-sim"]: print("Doing {}".format(prog)) try: c = all_calldata[prog] v = all_vardata[prog] except: print("Failed to load for {}".format(prog)) continue ## Get only parsimony informative sites ## Get T/F values for whether each genotype is ref or alt across all samples/loci is_alt_allele = map(lambda x: map(lambda y: 1 in y, x), c["genotype"]) ## Count the number of alt alleles per snp (we only want to retain when #alt > 1) alt_counts = map(lambda x: np.count_nonzero(x), is_alt_allele) ## Create a T/F mask for snps that are informative only_pis = map(lambda x: x < 2, alt_counts) ## Apply the mask to the variant array so we can pull out the position of each snp w/in each locus ## Also, compress() the masked array so we only actually see the pis pis = np.ma.array(np.array(v["POS"]), mask=only_pis).compressed() ## Now have to massage this into the list of counts per site in increasing order ## of position across the locus distpis = Counter([int(x) for x in pis]) #distpis = [x for x in sorted(counts.items())] ## Getting the distvar is easier distvar = Counter([int(x) for x in v.POS]) #distvar = [x for x in sorted(counts.items())] canvas, axes = SNP_position_plot(prog, distvar, distpis) ## save fig #toyplot.html.render(canvas, 'snp_positions.html') canvas # In[ ]: ## Checking out differences between 0.85 and 0.95 clustering threshold for ipyrad # In[339]: filename = "/home/iovercast/manuscript-analysis/Phocoena_empirical/ipyrad/reference-assembly/denovo_minus_reference-0.95_outfiles/denovo_ref-empirical.vcf" ## Only need to load once #myv = vcfnp.variants(filename, verbose=False, dtypes={"CHROM":"a24"}).view(np.recarray) #myc = vcfnp.calldata_2d(filename, verbose=False).view(np.recarray) f, axarr = plt.subplots(1, 2, figsize=(20,8), dpi=1000) ## Make a list out of the axes ## Set them in order so the plot looks nice. 
progs = ["ipyrad-reference-sim", "ipyrad-denovo_plus_reference-sim", "ipyrad-denovo_minus_reference-sim", "stacks-sim", "ddocent-fin-sim",\ "ipyrad-reference-empirical", "ipyrad-denovo_reference-empirical", "ipyrad-denovo_minus_reference-empirical", "stacks-empirical", "ddocent-fin-empirical"] rundict = {"0.85":all_calldata["ipyrad-denovo_minus_reference-empirical"], "0.95":myc} for prog, ax in zip(rundict.keys(), axarr): pop_colors = emp_pop_colors pops = emp_pops sample_names = emp_sample_names ## Don't die if some of the runs aren't complete try: coords1, model1 = getPCA(rundict[prog]) except: continue x = coords1[:, 0] y = coords1[:, 1] ax.scatter(x, y, marker='o') ax.set_xlabel('PC%s (%.1f%%)' % (1, model1.explained_variance_ratio_[0]*100)) ax.set_ylabel('PC%s (%.1f%%)' % (2, model1.explained_variance_ratio_[1]*100)) for pop in pops.keys(): flt = np.in1d(np.array(sample_names), pops[pop]) ax.plot(x[flt], y[flt], marker='o', linestyle=' ', color=pop_colors[pop], label=pop, markersize=10, mec='k', mew=.5) ax.set_title(prog, style="italic") ax.axison = True axarr[0].legend(frameon=True) # In[329]: def plot_ld(gn, title): m = allel.stats.rogers_huff_r(gn) ** 2 ax = allel.plot.pairwise_ld(m) ax.set_title(title) def ld_prune(gn, size=1000, step=1000, threshold=.3, n_iter=5): for i in range(n_iter): loc_unlinked = allel.stats.ld.locate_unlinked(gn, size=size, step=step, threshold=threshold) n = np.count_nonzero(loc_unlinked) n_remove = gn.shape[0] - n print('iteration', i+1, 'retaining', n, 'removing', n_remove, 'variants') gn = gn.compress(loc_unlinked, axis=0) return gn # In[293]: g = allel.GenotypeArray(myc.genotype) ac = g.count_alleles() ## Filter singletons and multi-allelic snps flt = (ac.max_allele() == 1) & (ac[:, :2].min(axis=1) > 1) gf = g.compress(flt, axis=0) gn = gf.to_n_alt() # In[303]: gnu = ld_prune(gn, size=50, step=200, threshold=.1, n_iter=5) # In[ ]: coords1, model1 = allel.pca(gnu, n_components=10, scaler='patterson')