from Bio import Phylo
import matplotlib.pyplot as plt
%matplotlib inline
import collections
import matplotlib
import subprocess
import os
plt.rcParams["figure.figsize"] = [12,9]
## Set the default directories for exec and data.
WORK_DIR="/home/iovercast/manuscript-analysis/"
EMPERICAL_DATA_DIR=os.path.join(WORK_DIR, "example_empirical_rad/")
IPYRAD_DIR=os.path.join(WORK_DIR, "ipyrad/")
PYRAD_EMPIRICAL_OUTPUT=os.path.join(WORK_DIR, "pyrad/REALDATA/")
STACKS_DIR=os.path.join(WORK_DIR, "stacks/")
AFTRRAD_DIR=os.path.join(WORK_DIR, "aftrRAD/")
DDOCENT_DIR=os.path.join(WORK_DIR, "dDocent/")
%%bash -s "$WORK_DIR"
## Install raxml
mkdir $1/miniconda/src
cd $1/miniconda/src
git clone https://github.com/stamatak/standard-RAxML.git
cd standard-RAxML
make -f Makefile.PTHREADS.gcc
cp raxml-PTHREADS $1/miniconda/bin
vcf_dict = {}
vcf_dict["ipyrad"] = os.path.join(IPYRAD_DIR, "REALDATA/REALDATA_outfiles/REALDATA-biallelic.recode.vcf")
vcf_dict["pyrad"] = os.path.join(PYRAD_EMPIRICAL_OUTPUT, "outfiles/c85d6m2p3H3N3.vcf")
vcf_dict["stacks_ungapped"] = os.path.join(STACKS_DIR, "REALDATA/ungapped/batch_1.vcf")
vcf_dict["stacks_gapped"] = os.path.join(STACKS_DIR, "REALDATA/gapped/batch_1.vcf")
vcf_dict["stacks_default"] = os.path.join(STACKS_DIR, "REALDATA/default/batch_1.vcf")
vcf_dict["aftrrad"] = os.path.join(AFTRRAD_DIR, "REALDATA/Formatting/REALDATA.vcf")
#vcf_dict["ddocent_full"] = os.path.join(DDOCENT_DIR, "REALDATA/TotalRawSNPs.vcf")
vcf_dict["ddocent_filt"] = os.path.join(DDOCENT_DIR, "REALDATA/Final.snpsonly.recode.vcf")
for k, v in vcf_dict.items():
if os.path.exists(v):
print("found - {}".format(v))
else:
print("not found - {}".format(v))
found - /home/iovercast/manuscript-analysis/stacks/REALDATA/default/batch_1.vcf found - /home/iovercast/manuscript-analysis/pyrad/REALDATA/outfiles/c85d6m2p3H3N3.vcf found - /home/iovercast/manuscript-analysis/aftrRAD/REALDATA/Formatting/REALDATA.vcf found - /home/iovercast/manuscript-analysis/ipyrad/REALDATA/REALDATA_outfiles/REALDATA-biallelic.recode.vcf not found - /home/iovercast/manuscript-analysis/stacks/REALDATA/gapped/batch_1.vcf found - /home/iovercast/manuscript-analysis/dDocent/REALDATA/Final.snpsonly.recode.vcf found - /home/iovercast/manuscript-analysis/stacks/REALDATA/ungapped/batch_1.vcf
## Get this script to convert from vcf to phy
## This script wants python 3, so I had to go in and add the future import for
## print by hand. Add this on line 17:
##
## from __future__ import print_function
##
## Also, had to update lines 32-34 like so:
##
## iupac = {"AG": "R", "CT": "Y", "CG": "S", "AT": "W", "GT": "K", "AC": "M",
## "CGT": "B", "AGT": "D", "ACT": "H", "ACG": "V", "ACGT": "N", "AA": "A",
## "CC": "C", "GG": "G", "TT":"T", "GN":"N", "CN":"N", "AN":"N", "TN":"N"}
## "NT":"N", "NC":"N", "NG":"N", "NA":"N", "NN":"N"}
## Also, the when we decompose the ddocent complex genotypes vcftools splits
## ./. data as '.', which VCF2phy.py kinda hates, so you have to add this piece
## of code on lines 75/75:
##
## if genotype_string == ".":
## return "N"
#!wget https://raw.githubusercontent.com/CoBiG2/RAD_Tools/master/VCF2phy.py
ped_raxdir = os.path.join(WORK_DIR, "Pedicularis_raxml")
if not os.path.exists(ped_raxdir):
os.mkdir(ped_raxdir)
os.chdir(ped_raxdir)
phy_dict = {}
for k, v in vcf_dict.items():
if os.path.exists(v):
outphy = v.rsplit("/", 1)[1]
outphy = os.path.join(ped_raxdir, k + "-" + outphy.rsplit(".", 1)[0])
print(k, outphy)
cmd = "python VCF2phy.py -vcf {} -o {}".format(v, outphy)
try:
ret = subprocess.check_output(cmd, shell=True)
except:
print(cmd)
print("failed - {}".format(k))
#print(ret)
phy_dict[k] = outphy
('stacks_default', '/home/iovercast/manuscript-analysis/Pedicularis_raxml/stacks_default-batch_1') ('pyrad', '/home/iovercast/manuscript-analysis/Pedicularis_raxml/pyrad-c85d6m2p3H3N3') ('aftrrad', '/home/iovercast/manuscript-analysis/Pedicularis_raxml/aftrrad-REALDATA') python VCF2phy.py -vcf /home/iovercast/manuscript-analysis/aftrRAD/REALDATA/Formatting/REALDATA.vcf -o /home/iovercast/manuscript-analysis/Pedicularis_raxml/aftrrad-REALDATA failed - aftrrad ('ipyrad', '/home/iovercast/manuscript-analysis/Pedicularis_raxml/ipyrad-REALDATA-biallelic.recode') ('ddocent_filt', '/home/iovercast/manuscript-analysis/Pedicularis_raxml/ddocent_filt-Final.snpsonly.recode') ('stacks_ungapped', '/home/iovercast/manuscript-analysis/Pedicularis_raxml/stacks_ungapped-batch_1')
ped_trees = {}
## Gonna have to hope raxml is smart enough to use the fasta cuz the vcf we get out of
## aftrrad is broke.
phy_dict["aftrrad"] = "/home/iovercast/manuscript-analysis/aftrRAD/REALDATA/Formatting/SNPMatrix_17.All.fasta"\
for assembler, inputfile in phy_dict.items():
outdir = ped_raxdir
ncores = 20
physplit = 2
cmd = "{}raxmlHPC-PTHREADS -f a ".format(WORK_DIR + "/miniconda/bin/") \
+ " -T {} ".format(ncores) \
+ " -m GTRGAMMA " \
+ " -N 100 " \
+ " -x 12345 " \
+ " -p 54321 " \
+ " -n {} ".format(assembler) \
+ " -w {} ".format(outdir) \
+ " -s {}".format(inputfile + ".phy")
print(cmd)
## What's the difference?
## out_trees[assembler] = "{}/RAxML_bestTree.{}".format(outdir, assembler)
ped_trees[assembler] = "{}/RAxML_bipartitions.{}".format(outdir, assembler)
print(assembler)
#!time $cmd
/home/iovercast/manuscript-analysis//miniconda/bin/raxmlHPC-PTHREADS -f a -T 20 -m GTRGAMMA -N 100 -x 12345 -p 54321 -n stacks_default -w /home/iovercast/manuscript-analysis/Pedicularis_raxml -s /home/iovercast/manuscript-analysis/Pedicularis_raxml/stacks_default-batch_1.phy stacks_default /home/iovercast/manuscript-analysis//miniconda/bin/raxmlHPC-PTHREADS -f a -T 20 -m GTRGAMMA -N 100 -x 12345 -p 54321 -n ddocent_filt -w /home/iovercast/manuscript-analysis/Pedicularis_raxml -s /home/iovercast/manuscript-analysis/Pedicularis_raxml/ddocent_filt-Final.snpsonly.recode.phy ddocent_filt /home/iovercast/manuscript-analysis//miniconda/bin/raxmlHPC-PTHREADS -f a -T 20 -m GTRGAMMA -N 100 -x 12345 -p 54321 -n aftrrad -w /home/iovercast/manuscript-analysis/Pedicularis_raxml -s /home/iovercast/manuscript-analysis/aftrRAD/REALDATA/Formatting/SNPMatrix_17.All.fasta.phy aftrrad /home/iovercast/manuscript-analysis//miniconda/bin/raxmlHPC-PTHREADS -f a -T 20 -m GTRGAMMA -N 100 -x 12345 -p 54321 -n pyrad -w /home/iovercast/manuscript-analysis/Pedicularis_raxml -s /home/iovercast/manuscript-analysis/Pedicularis_raxml/pyrad-c85d6m2p3H3N3.phy pyrad /home/iovercast/manuscript-analysis//miniconda/bin/raxmlHPC-PTHREADS -f a -T 20 -m GTRGAMMA -N 100 -x 12345 -p 54321 -n ipyrad -w /home/iovercast/manuscript-analysis/Pedicularis_raxml -s /home/iovercast/manuscript-analysis/Pedicularis_raxml/ipyrad-REALDATA-biallelic.recode.phy ipyrad /home/iovercast/manuscript-analysis//miniconda/bin/raxmlHPC-PTHREADS -f a -T 20 -m GTRGAMMA -N 100 -x 12345 -p 54321 -n stacks_ungapped -w /home/iovercast/manuscript-analysis/Pedicularis_raxml -s /home/iovercast/manuscript-analysis/Pedicularis_raxml/stacks_ungapped-batch_1.phy stacks_ungapped
print(ped_trees)
{'stacks_default': '/home/iovercast/manuscript-analysis/Pedicularis_raxml/RAxML_bipartitions.stacks_default', 'pyrad': '/home/iovercast/manuscript-analysis/Pedicularis_raxml/RAxML_bipartitions.pyrad', 'aftrrad': '/home/iovercast/manuscript-analysis/Pedicularis_raxml/RAxML_bipartitions.aftrrad', 'ipyrad': '/home/iovercast/manuscript-analysis/Pedicularis_raxml/RAxML_bipartitions.ipyrad', 'ddocent_filt': '/home/iovercast/manuscript-analysis/Pedicularis_raxml/RAxML_bipartitions.ddocent_filt', 'stacks_ungapped': '/home/iovercast/manuscript-analysis/Pedicularis_raxml/RAxML_bipartitions.stacks_ungapped'}
## Get sample names and assign them to a species dict
IPYRAD_STATS=os.path.join(IPYRAD_DIR, "REALDATA/REALDATA_outfiles/REALDATA_stats.txt")
infile = open(IPYRAD_STATS).readlines()
sample_names = [x.strip().split()[0] for x in infile[20:33]]
species = set([x.split("_")[1] for x in sample_names])
species_dict = collections.OrderedDict([])
## Ordered dict of sample names and their species, in same order
## as the vcf file
for s in sample_names:
species_dict[s] = s.split("_")[1]
print(species_dict)
## Map species names to groups of individuals
#for s in species:
# species_dict[s] = [x for x in sample_names if s in x]
print(species_dict)
species_colors = {
'rex': '#FF0000',
'cyathophylla': '#008000',
'thamno': '#00FFFF',
'cyathophylloides': '#90EE90',
'przewalskii': '#FFA500',
'superba': '#8B0000',
}
emp_colors_per_sample = {}
for name, pop in species_dict.items():
emp_colors_per_sample[name + "_R1_"] = species_colors[pop]
print(emp_colors_per_sample)
samples_per_species = {}
for s in species:
samples_per_species[s] = [x + "_R1_" for x in sample_names if s in x]
print(samples_per_species)
OrderedDict([('29154_superba', 'superba'), ('30556_thamno', 'thamno'), ('30686_cyathophylla', 'cyathophylla'), ('32082_przewalskii', 'przewalskii'), ('33413_thamno', 'thamno'), ('33588_przewalskii', 'przewalskii'), ('35236_rex', 'rex'), ('35855_rex', 'rex'), ('38362_rex', 'rex'), ('39618_rex', 'rex'), ('40578_rex', 'rex'), ('41478_cyathophylloides', 'cyathophylloides'), ('41954_cyathophylloides', 'cyathophylloides')]) OrderedDict([('29154_superba', 'superba'), ('30556_thamno', 'thamno'), ('30686_cyathophylla', 'cyathophylla'), ('32082_przewalskii', 'przewalskii'), ('33413_thamno', 'thamno'), ('33588_przewalskii', 'przewalskii'), ('35236_rex', 'rex'), ('35855_rex', 'rex'), ('38362_rex', 'rex'), ('39618_rex', 'rex'), ('40578_rex', 'rex'), ('41478_cyathophylloides', 'cyathophylloides'), ('41954_cyathophylloides', 'cyathophylloides')]) {'30556_thamno_R1_': '#00FFFF', '33588_przewalskii_R1_': '#FFA500', '38362_rex_R1_': '#FF0000', '33413_thamno_R1_': '#00FFFF', '35236_rex_R1_': '#FF0000', '40578_rex_R1_': '#FF0000', '30686_cyathophylla_R1_': '#008000', '41954_cyathophylloides_R1_': '#90EE90', '41478_cyathophylloides_R1_': '#90EE90', '35855_rex_R1_': '#FF0000', '32082_przewalskii_R1_': '#FFA500', '29154_superba_R1_': '#8B0000', '39618_rex_R1_': '#FF0000'} {'rex': ['35236_rex_R1_', '35855_rex_R1_', '38362_rex_R1_', '39618_rex_R1_', '40578_rex_R1_'], 'cyathophylla': ['30686_cyathophylla_R1_'], 'thamno': ['30556_thamno_R1_', '33413_thamno_R1_'], 'cyathophylloides': ['41478_cyathophylloides_R1_', '41954_cyathophylloides_R1_'], 'przewalskii': ['32082_przewalskii_R1_', '33588_przewalskii_R1_'], 'superba': ['29154_superba_R1_']}
treefiles = ped_trees
legend_colors = [matplotlib.patches.Patch(color=v, label=k) for k,v in species_colors.items()]
## Root the tree on the WBS pop
outgroup = [{'name': sample_name} for sample_name in samples_per_species["przewalskii"]]
f, axarr = plt.subplots(6, 1, figsize=(15, 30), dpi=1000)
#axarr = [a for b in axarr for a in b]
for name, ax in zip(treefiles.keys(), axarr):
print(name, ax)
try:
tree = Phylo.read(treefiles[name], 'newick')
outgroup = [{"name":x.name} for x in tree.get_terminals() if "prze" in x.name]
if "ddocent" in name or "ipyrad" in name:
# outgroup = [{g.keys():g.values().split("_R1_")[0] for g in outgroup}]
emp_colors_per_sample = {k.split("_R1_")[0]:v for k,v in emp_colors_per_sample.items()}
tree.root_with_outgroup(*outgroup)
tree.ladderize()
## This could be cool but doesn't work
Phylo.draw(tree, axes=ax, label_colors=emp_colors_per_sample, do_show=False)
ax.set_title(name)
ax.legend(handles=legend_colors, loc="lower left")
except Exception as inst:
print("failed - {}".format(name))
print(inst)
('stacks_default', <matplotlib.axes._subplots.AxesSubplot object at 0x2aaaf05a8c50>) ('pyrad', <matplotlib.axes._subplots.AxesSubplot object at 0x2aaaf07b4e90>) ('aftrrad', <matplotlib.axes._subplots.AxesSubplot object at 0x2aaaf0836250>) ('ipyrad', <matplotlib.axes._subplots.AxesSubplot object at 0x2aaaf09dc450>) ('ddocent_filt', <matplotlib.axes._subplots.AxesSubplot object at 0x2aaaf085f8d0>) ('stacks_ungapped', <matplotlib.axes._subplots.AxesSubplot object at 0x2aaaf0749a50>)
pzew = [{"name":x.name} for x in tree.get_terminals() if "prze" in x.name]
print(pzew)
[{'name': '33588_przewalskii_R1_'}, {'name': '32082_przewalskii_R1_'}]