The data sets used in this notebook were generated with ipyrad (see notebook here). You can re-create the data sets used here by running that notebook.
Software installation (conda)
Phylogenetic analysis (raxml)
[Tree plots (toytree)](#Tree plot)
All software required for this notebook can be installed locally using conda.
## conda install toytree -c eaton-lab
## conda install ipyrad -c ipyrad
## conda install raxml -c bioconda
## import packages
import ipyrad as ip
import ipyrad.analysis as ipa
import toyplot
import toytree
import numpy as np
import glob
## print ipyrad info
print "ipyrad v.{}".format(ip.__version__)
ipyrad v.0.6.20
We will distribute jobs across an HPC cluster using the ipyparallel library (which is installed as a dependency of ipyrad). Start an ipcluster instance and check your connection below.
## print ipyparallel cluster information
import ipyparallel as ipp
ipyclient = ipp.Client()
print ip.cluster_info(ipyclient)
host compute node: [40 cores] on tinus
This is a convenient function to return only the sample names that belong to a given clade, either "pharma", "urost", "urost1" or "urost2", corresponding to major clades in our data set.
def get_subsample_names(data, clade):
## known clades
c1 = ["tonduzii", "maxima", "yoponens", "glabrata", "insipida"]
c2 = ["nymph", "obtus", "pope", "bull", "citri", "paraen",
"pertus", "perfor", "dugan", "turbin", "colub",
"costa", "tria", "trig"]
## select clades from a dict
clades={
"pharmacosycea": c1,
"americana": c2,
}
## return selected clade names
keys = data.samples.keys()
names = [i for i in keys if any([bit in i for bit in clades[clade]])]
return names
In the assembly notebook we created several data sets which include all (76) samples that passed our initial filtering, meaning they had at least N consensus reads. Here we create several additional subsets of the data for phylogenetic inference. In particular, we will remove known hybrid samples from the data, where hybrids were identified from STRUCTURE and ABBA-BABA analyses (see other notebooks).
## list of hybrid samples
pharma_hybrids = [
"B131_glabrataXmaxima",
"A96_glabrata",
"B123_maxima",
"C18_maxima",
]
america_hybrids = [
"C31_triangle",
"C30_triangle",
"A55_triangle",
"C11_costaricana",
"A87_costaricana",
"C04_colubrinae",
]
## parent from which branches will be made
parent = ip.load_json("analysis-ipyrad/ficus_dhi_s4.json")
## new pharma subsample no-hybrids branches
subs = list(set(get_subsample_names(parent, "pharmacosycea")) - set(pharma_hybrids))
pharma = parent.branch("pharma_dhi_s10_nohybrids", subsamples=subs)
pharma.set_params("min_samples_locus", 10)
pharma.run("7", force=True)
## new america subsample no-hybrids branches
subs = list(set(get_subsample_names(parent, "americana")) - set(america_hybrids))
america = parent.branch("america_dhi_s20_nohybrids", subsamples=subs)
america.set_params("min_samples_locus", 20)
america.run("7", force=True)
loading Assembly: ficus_dhi_s4 from saved path: ~/Documents/Ficus/analysis-ipyrad/ficus_dhi_s4.json Assembly: pharma_dhi_s10_nohybrids [####################] 100% filtering loci | 0:00:14 | s7 | [####################] 100% building loci/stats | 0:00:38 | s7 | [####################] 100% building alleles | 0:00:43 | s7 | [####################] 100% building vcf file | 0:00:52 | s7 | [####################] 100% writing vcf file | 0:00:00 | s7 | [####################] 100% building arrays | 0:00:12 | s7 | [####################] 100% writing outfiles | 0:00:09 | s7 | Outfiles written to: ~/Documents/Ficus/analysis-ipyrad/pharma_dhi_s10_nohybrids_outfiles Assembly: america_dhi_s20_nohybrids [####################] 100% filtering loci | 0:00:33 | s7 | [####################] 100% building loci/stats | 0:00:37 | s7 | [####################] 100% building alleles | 0:00:44 | s7 | [####################] 100% building vcf file | 0:00:59 | s7 | [####################] 100% writing vcf file | 0:00:00 | s7 | [####################] 100% building arrays | 0:00:25 | s7 | [####################] 100% writing outfiles | 0:00:14 | s7 | Outfiles written to: ~/Documents/Ficus/analysis-ipyrad/america_dhi_s20_nohybrids_outfiles
RAxML
¶(Maximum Likelihood & rapid bootstrapping)
## data sets for which we want to infer an ML tree
keys = [
"ficus_dhi_s20", # high depth dsets
"ficus_dhi_s10", #
"ficus_dhi_s4", #
"pharma_dhi_s10_nohybrids", # no hybrid samples
"america_dhi_s20_nohybrids", #
"ficus_dlo_s4", # most possible data
]
## a dictionary of raxml objects
raxdict = {}
## iterate over keys (data sets)
for key in keys:
data = dsets[key]
## set outgroup
if "ficus" in key:
out = get_sample_names(data, "pharma")
else:
out = None
## init a raxml object
raxdict[key] = ipa.raxml(
name=data.name,
data=data.outfiles.phy,
o=out,
N=100,
T=10)
## print the last rax command for prosperities sake
raxdict[key].command
'raxmlHPC-PTHREADS-AVX -f a -T 10 -m GTRGAMMA -N 100 -x 12345 -p 54321 -n ficus_dlo_s4 -w /home/deren/Documents/Ficus/analysis-raxml -s /home/deren/Documents/Ficus/analysis-ipyrad/ficus_dlo_s4_outfiles/ficus_dlo_s4.phy -o C17_maxima,C18_maxima,B123_maxima,B130_glabrataXmaxima,B133_glabrata,C47_yoponensis,C48_tonduzii,B119_maxima,B128_insipida,B126_insipida,C46_yoponensis,B127_insipida,C15_insipida,C45_yoponensis,C50_insipida,A95_insipida,B131_glabrataXmaxima,B134_glabrata,A94_maxima,B118_maxima,B120_maxima,A96_glabrata,A97_glabrata'
## call raxml on each data set (command above)
for rax in raxdict:
raxdict[rax].run()
## draw the pruned "urostigma" tree
tre = toytree.tree("analysis-raxml/RAxML_bipartitions.ficus_dhi_s10")
#tre.tree.prune(get_subsample_names(parent, 'americana'))
c, a = tre.draw(
width=400,
height=900,
node_labels=tre.get_node_values("support"),
use_edge_lengths=True,
tip_labels_align=True,
);
toyplot.html.render(c, "figures/ficus_dhi_s10.html")
c
## draw the pruned "urostigma" tree
tre = toytree.tree("analysis-raxml/RAxML_bipartitions.ficus_dhi_s10")
tre.tree.prune(get_sample_names(parent, 'urost'))
## colormap support values
palette = toyplot.color.brewer.map(
name="Greys", domain_min=50, domain_max=100, reverse=True)
vals = np.array(tre.get_node_values("support", 1, 1))
colors = toyplot.color.broadcast((vals, palette), shape=vals.shape)
colors = colors.tolist()
## draw tree
tre.draw(
width=400,
height=700,
node_labels=True, #tre.get_node_values("support"),
node_color=colors,
node_style={"stroke": "#262626"},
node_size=[12 if i else 0 for i in tre.get_node_values("support", 1, 0)],
use_edge_lengths=True,
tip_labels_align=True,
);