#!/usr/bin/env python # coding: utf-8 # In[32]: from Bio import Phylo import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') import collections import matplotlib import subprocess import os plt.rcParams["figure.figsize"] = [12,9] ## Set the default directories for exec and data. WORK_DIR="/home/iovercast/manuscript-analysis/" EMPERICAL_DATA_DIR=os.path.join(WORK_DIR, "example_empirical_rad/") IPYRAD_DIR=os.path.join(WORK_DIR, "ipyrad/") PYRAD_EMPIRICAL_OUTPUT=os.path.join(WORK_DIR, "pyrad/REALDATA/") STACKS_DIR=os.path.join(WORK_DIR, "stacks/") AFTRRAD_DIR=os.path.join(WORK_DIR, "aftrRAD/") DDOCENT_DIR=os.path.join(WORK_DIR, "dDocent/") # ## Install raxml # In[ ]: get_ipython().run_cell_magic('bash', '-s "$WORK_DIR"', '## Install raxml\nmkdir $1/miniconda/src\ncd $1/miniconda/src\ngit clone https://github.com/stamatak/standard-RAxML.git\ncd standard-RAxML\nmake -f Makefile.PTHREADS.gcc\ncp raxml-PTHREADS $1/miniconda/bin\n') # In[60]: vcf_dict = {} vcf_dict["ipyrad"] = os.path.join(IPYRAD_DIR, "REALDATA/REALDATA_outfiles/REALDATA-biallelic.recode.vcf") vcf_dict["pyrad"] = os.path.join(PYRAD_EMPIRICAL_OUTPUT, "outfiles/c85d6m2p3H3N3.vcf") vcf_dict["stacks_ungapped"] = os.path.join(STACKS_DIR, "REALDATA/ungapped/batch_1.vcf") vcf_dict["stacks_gapped"] = os.path.join(STACKS_DIR, "REALDATA/gapped/batch_1.vcf") vcf_dict["stacks_default"] = os.path.join(STACKS_DIR, "REALDATA/default/batch_1.vcf") vcf_dict["aftrrad"] = os.path.join(AFTRRAD_DIR, "REALDATA/Formatting/REALDATA.vcf") #vcf_dict["ddocent_full"] = os.path.join(DDOCENT_DIR, "REALDATA/TotalRawSNPs.vcf") vcf_dict["ddocent_filt"] = os.path.join(DDOCENT_DIR, "REALDATA/Final.snpsonly.recode.vcf") for k, v in vcf_dict.items(): if os.path.exists(v): print("found - {}".format(v)) else: print("not found - {}".format(v)) # ## Make phylip files for all assemblers # In[61]: ## Get this script to convert from vcf to phy ## This script wants python 3, so I had to go in and add the future import for ## print by hand. Add this on line 17: ## ## from __future__ import print_function ## ## Also, had to update lines 32-34 like so: ## ## iupac = {"AG": "R", "CT": "Y", "CG": "S", "AT": "W", "GT": "K", "AC": "M", ## "CGT": "B", "AGT": "D", "ACT": "H", "ACG": "V", "ACGT": "N", "AA": "A", ## "CC": "C", "GG": "G", "TT":"T", "GN":"N", "CN":"N", "AN":"N", "TN":"N"} ## "NT":"N", "NC":"N", "NG":"N", "NA":"N", "NN":"N"} ## Also, the when we decompose the ddocent complex genotypes vcftools splits ## ./. data as '.', which VCF2phy.py kinda hates, so you have to add this piece ## of code on lines 75/75: ## ## if genotype_string == ".": ## return "N" #!wget https://raw.githubusercontent.com/CoBiG2/RAD_Tools/master/VCF2phy.py ped_raxdir = os.path.join(WORK_DIR, "Pedicularis_raxml") if not os.path.exists(ped_raxdir): os.mkdir(ped_raxdir) os.chdir(ped_raxdir) phy_dict = {} for k, v in vcf_dict.items(): if os.path.exists(v): outphy = v.rsplit("/", 1)[1] outphy = os.path.join(ped_raxdir, k + "-" + outphy.rsplit(".", 1)[0]) print(k, outphy) cmd = "python VCF2phy.py -vcf {} -o {}".format(v, outphy) try: ret = subprocess.check_output(cmd, shell=True) except: print(cmd) print("failed - {}".format(k)) #print(ret) phy_dict[k] = outphy # ## Run raxml on the phylip files # In[63]: ped_trees = {} ## Gonna have to hope raxml is smart enough to use the fasta cuz the vcf we get out of ## aftrrad is broke. phy_dict["aftrrad"] = "/home/iovercast/manuscript-analysis/aftrRAD/REALDATA/Formatting/SNPMatrix_17.All.fasta"\ for assembler, inputfile in phy_dict.items(): outdir = ped_raxdir ncores = 20 physplit = 2 cmd = "{}raxmlHPC-PTHREADS -f a ".format(WORK_DIR + "/miniconda/bin/") \ + " -T {} ".format(ncores) \ + " -m GTRGAMMA " \ + " -N 100 " \ + " -x 12345 " \ + " -p 54321 " \ + " -n {} ".format(assembler) \ + " -w {} ".format(outdir) \ + " -s {}".format(inputfile + ".phy") print(cmd) ## What's the difference? ## out_trees[assembler] = "{}/RAxML_bestTree.{}".format(outdir, assembler) ped_trees[assembler] = "{}/RAxML_bipartitions.{}".format(outdir, assembler) print(assembler) #!time $cmd # In[68]: print(ped_trees) # ## Make the pretty colors for each population # In[130]: ## Get sample names and assign them to a species dict IPYRAD_STATS=os.path.join(IPYRAD_DIR, "REALDATA/REALDATA_outfiles/REALDATA_stats.txt") infile = open(IPYRAD_STATS).readlines() sample_names = [x.strip().split()[0] for x in infile[20:33]] species = set([x.split("_")[1] for x in sample_names]) species_dict = collections.OrderedDict([]) ## Ordered dict of sample names and their species, in same order ## as the vcf file for s in sample_names: species_dict[s] = s.split("_")[1] print(species_dict) ## Map species names to groups of individuals #for s in species: # species_dict[s] = [x for x in sample_names if s in x] print(species_dict) species_colors = { 'rex': '#FF0000', 'cyathophylla': '#008000', 'thamno': '#00FFFF', 'cyathophylloides': '#90EE90', 'przewalskii': '#FFA500', 'superba': '#8B0000', } emp_colors_per_sample = {} for name, pop in species_dict.items(): emp_colors_per_sample[name + "_R1_"] = species_colors[pop] print(emp_colors_per_sample) samples_per_species = {} for s in species: samples_per_species[s] = [x + "_R1_" for x in sample_names if s in x] print(samples_per_species) # ## Plot the trees # In[131]: treefiles = ped_trees legend_colors = [matplotlib.patches.Patch(color=v, label=k) for k,v in species_colors.items()] ## Root the tree on the WBS pop outgroup = [{'name': sample_name} for sample_name in samples_per_species["przewalskii"]] f, axarr = plt.subplots(6, 1, figsize=(15, 30), dpi=1000) #axarr = [a for b in axarr for a in b] for name, ax in zip(treefiles.keys(), axarr): print(name, ax) try: tree = Phylo.read(treefiles[name], 'newick') outgroup = [{"name":x.name} for x in tree.get_terminals() if "prze" in x.name] if "ddocent" in name or "ipyrad" in name: # outgroup = [{g.keys():g.values().split("_R1_")[0] for g in outgroup}] emp_colors_per_sample = {k.split("_R1_")[0]:v for k,v in emp_colors_per_sample.items()} tree.root_with_outgroup(*outgroup) tree.ladderize() ## This could be cool but doesn't work Phylo.draw(tree, axes=ax, label_colors=emp_colors_per_sample, do_show=False) ax.set_title(name) ax.legend(handles=legend_colors, loc="lower left") except Exception as inst: print("failed - {}".format(name)) print(inst) # In[125]: pzew = [{"name":x.name} for x in tree.get_terminals() if "prze" in x.name] print(pzew) # In[ ]: