## conda install ipyrad -c ipyrad
## conda install toytree -c eaton-lab
## conda install structure -c ipyrad
## conda install clumpp -c ipyrad
import toytree
import toyplot.svg
import numpy as np
import ipyrad as ip
import ipyrad.analysis as ipa
print "ipyrad v.{}".format(ip.__version__)
ipyrad v.0.7.22
import ipyparallel as ipp
ipyclient = ipp.Client()
ip.cluster_info(ipyclient)
host compute node: [40 cores] on sacra
## define k vals to test
tests = [2, 3, 4, 5, 6, 7, 8, 9, 10]
## init structure analysis object
s = ipa.structure(
name="Canarium-min30-nout",
data="analysis-ipyrad/Canarium-min30-nout_outfiles/Canarium-min30-nout.str",
mapfile="analysis-ipyrad/Canarium-min30-nout_outfiles/Canarium-min30-nout.snps.map",
)
## set mainparams for object
s.mainparams.burnin = 50000
s.mainparams.numreps = 100000
## submit batches of 20 replicate jobs for each value of K
for kpop in tests:
s.run(
kpop=kpop,
nreps=20,
seed=12345,
ipyclient=ipyclient,
)
submitted 20 structure jobs [Canarium-min30-nout-K-2] submitted 20 structure jobs [Canarium-min30-nout-K-3] submitted 20 structure jobs [Canarium-min30-nout-K-4] submitted 20 structure jobs [Canarium-min30-nout-K-5] submitted 20 structure jobs [Canarium-min30-nout-K-6] submitted 20 structure jobs [Canarium-min30-nout-K-7] submitted 20 structure jobs [Canarium-min30-nout-K-8] submitted 20 structure jobs [Canarium-min30-nout-K-9] submitted 20 structure jobs [Canarium-min30-nout-K-10]
For the high K values it seems that some of them had few or no reps that converged, so we ran an additional 20 reps for each K from 6-10 for a longer number of generations to get better convergence.
## set longer run times for additional reps on large Ks
s.mainparams.burnin = 100000
s.mainparams.numreps = 500000
moretests = [6, 7, 8, 9, 10]
## submit batches of 20 replicate jobs for each high value of K
for kpop in moretests:
s.run(
kpop=kpop,
nreps=20,
seed=12345,
ipyclient=ipyclient,
)
submitted 20 structure jobs [Canarium-min30-nout-K-6] submitted 20 structure jobs [Canarium-min30-nout-K-7] submitted 20 structure jobs [Canarium-min30-nout-K-8] submitted 20 structure jobs [Canarium-min30-nout-K-9] submitted 20 structure jobs [Canarium-min30-nout-K-10]
Below are Evanno tables calculated for several different values of the filtering parameter max_var_multiple
, which is used to exclude replicates that have a variance of LnLik N times greater than the rep with minimum variance for that value of K. We find that with max_var_multiple=100
we get approximately 20 reps in every test that passes filtering. The goal here is to exclude reps that may not have converged and therefore would otherwise lower the score for that value of K.
s.get_evanno_table(tests, max_var_multiple=0, quiet=True)
Nreps | deltaK | estLnProbMean | estLnProbStdev | lnPK | lnPPK | |
---|---|---|---|---|---|---|
2 | 20 | 0.000 | -1.537e+05 | 2.406e+03 | 0.000e+00 | 0.000e+00 |
3 | 20 | 2.029 | -1.373e+05 | 1.043e+04 | 1.638e+04 | 2.116e+04 |
4 | 20 | 1235.503 | -1.421e+05 | 1.556e+03 | -4.773e+03 | 1.923e+06 |
5 | 20 | 0.847 | -2.069e+06 | 3.035e+06 | -1.927e+06 | 2.569e+06 |
6 | 39 | 0.674 | -1.427e+06 | 3.116e+06 | 6.420e+05 | 2.099e+06 |
7 | 39 | 0.284 | -2.884e+06 | 5.253e+06 | -1.457e+06 | 1.493e+06 |
8 | 39 | 0.143 | -2.848e+06 | 5.440e+06 | 3.654e+04 | 7.805e+05 |
9 | 39 | 0.308 | -3.592e+06 | 5.755e+06 | -7.440e+05 | 1.775e+06 |
10 | 39 | 0.000 | -2.561e+06 | 3.312e+06 | 1.031e+06 | 0.000e+00 |
s.get_evanno_table(tests, max_var_multiple=10, quiet=True)
Nreps | deltaK | estLnProbMean | estLnProbStdev | lnPK | lnPPK | |
---|---|---|---|---|---|---|
2 | 20 | 0.000 | -153726.185 | 2405.508 | 0.000 | 0.000 |
3 | 20 | 2.029 | -137343.785 | 10428.473 | 16382.400 | 21155.730 |
4 | 20 | 4.618 | -142117.115 | 1556.118 | -4773.330 | 7186.853 |
5 | 13 | 0.792 | -139703.592 | 2555.662 | 2413.523 | 2025.005 |
6 | 24 | 1.780 | -139315.075 | 7703.825 | 388.517 | 13710.965 |
7 | 13 | 0.581 | -152637.523 | 36075.618 | -13322.448 | 20948.565 |
8 | 17 | 0.646 | -145011.406 | 32030.165 | 7626.117 | 20693.599 |
9 | 8 | 0.218 | -158078.888 | 35401.179 | -13067.482 | 7730.769 |
10 | 4 | 0.000 | -163415.600 | 31602.319 | -5336.712 | 0.000 |
s.get_evanno_table(tests, max_var_multiple=50, quiet=True)
Nreps | deltaK | estLnProbMean | estLnProbStdev | lnPK | lnPPK | |
---|---|---|---|---|---|---|
2 | 20 | 0.000 | -153726.185 | 2405.508 | 0.000 | 0.000 |
3 | 20 | 2.029 | -137343.785 | 10428.473 | 16382.400 | 21155.730 |
4 | 20 | 4.618 | -142117.115 | 1556.118 | -4773.330 | 7186.853 |
5 | 13 | 9.085 | -139703.592 | 2555.662 | 2413.523 | 23218.227 |
6 | 26 | 0.645 | -160508.296 | 82413.941 | -20804.704 | 53179.488 |
7 | 17 | 0.594 | -234492.488 | 168026.173 | -73984.192 | 99840.352 |
8 | 21 | 1.320 | -208636.329 | 152555.065 | 25856.160 | 201418.181 |
9 | 20 | 0.423 | -384198.350 | 230926.987 | -175562.021 | 97648.791 |
10 | 20 | 0.000 | -462111.580 | 221353.039 | -77913.230 | 0.000 |
s.get_evanno_table(tests, max_var_multiple=100, quiet=True)
Nreps | deltaK | estLnProbMean | estLnProbStdev | lnPK | lnPPK | |
---|---|---|---|---|---|---|
2 | 20 | 0.000 | -153726.185 | 2405.508 | 0.000 | 0.000 |
3 | 20 | 2.029 | -137343.785 | 10428.473 | 16382.400 | 21155.730 |
4 | 20 | 4.618 | -142117.115 | 1556.118 | -4773.330 | 7186.853 |
5 | 13 | 40.533 | -139703.592 | 2555.662 | 2413.523 | 103589.589 |
6 | 29 | 0.894 | -240879.659 | 254325.896 | -101176.066 | 227331.040 |
7 | 26 | 1.045 | -569386.765 | 506552.755 | -328507.107 | 529423.850 |
8 | 27 | 1.071 | -368470.022 | 342168.594 | 200916.743 | 366448.612 |
9 | 23 | 0.343 | -534001.891 | 450635.181 | -165531.869 | 154559.665 |
10 | 22 | 0.000 | -544974.095 | 341123.401 | -10972.204 | 0.000 |
s.get_evanno_table(tests, max_var_multiple=1000, quiet=True)
Nreps | deltaK | estLnProbMean | estLnProbStdev | lnPK | lnPPK | |
---|---|---|---|---|---|---|
2 | 20 | 0.000 | -1.537e+05 | 2.406e+03 | 0.000e+00 | 0.000e+00 |
3 | 20 | 2.029 | -1.373e+05 | 1.043e+04 | 1.638e+04 | 2.116e+04 |
4 | 20 | 983.330 | -1.421e+05 | 1.556e+03 | -4.773e+03 | 1.530e+06 |
5 | 19 | 0.967 | -1.677e+06 | 2.544e+06 | -1.535e+06 | 2.461e+06 |
6 | 37 | 2.031 | -7.510e+05 | 1.030e+06 | 9.261e+05 | 2.093e+06 |
7 | 37 | 0.413 | -1.918e+06 | 2.972e+06 | -1.167e+06 | 1.227e+06 |
8 | 37 | 0.250 | -1.858e+06 | 3.142e+06 | 5.974e+04 | 7.863e+05 |
9 | 37 | 0.199 | -2.585e+06 | 3.780e+06 | -7.265e+05 | 7.504e+05 |
10 | 39 | 0.000 | -2.561e+06 | 3.312e+06 | 2.385e+04 | 0.000e+00 |
We calculate a permuted table of results across replicate runs for each value of K while excluding reps based on the max_var_multiple parameter (see above for description).
## summarize results
s.clumppparams.m = 3 ## use largegreedy algorithm
s.clumppparams.greedy_option = 2 ## test nrepeat possible orders
s.clumppparams.repeats = 100000 ## number of repeats
qtable = s.get_clumpp_table(tests, max_var_multiple=100.)
[K2] 20/20 results permuted across replicates (max_var=100.0). [K3] 20/20 results permuted across replicates (max_var=100.0). [K4] 20/20 results permuted across replicates (max_var=100.0). [K5] 13/20 results permuted across replicates (max_var=100.0). [K6] 29/39 results permuted across replicates (max_var=100.0). [K7] 26/39 results permuted across replicates (max_var=100.0). [K8] 27/39 results permuted across replicates (max_var=100.0). [K9] 23/39 results permuted across replicates (max_var=100.0). [K10] 22/39 results permuted across replicates (max_var=100.0).
qtable = s.get_clumpp_table(tests, max_var_multiple=100.)
[K2] 20/20 results permuted across replicates (max_var=100.0). [K3] 20/20 results permuted across replicates (max_var=100.0). [K4] 20/20 results permuted across replicates (max_var=100.0). [K5] 13/20 results permuted across replicates (max_var=100.0). [K6] 29/39 results permuted across replicates (max_var=100.0). [K7] 26/39 results permuted across replicates (max_var=100.0). [K8] 27/39 results permuted across replicates (max_var=100.0). [K9] 23/39 results permuted across replicates (max_var=100.0). [K10] 22/39 results permuted across replicates (max_var=100.0).
## load a tree topology for ordering tips on barplot
tre = toytree.tree("./analysis-raxml/RAxML_bipartitions.Canarium-min20")
## root on outgroups and then drop outgroups from the tree
outs = ["D14269", "D13374", "D13852", "SFC1988"]
rtre = tre.root(outs).drop_tips(outs)
for kpop in tests:
c = toyplot.Canvas(width=550, height=110)
a = c.cartesian(margin=25)
minpos = np.linspace(0, 38, 38)
maxpos = np.linspace(0.9, 38.9, 38)
m = a.bars(minpos, maxpos,
qtable[kpop].loc[rtre.get_tip_labels()[::-1],],
color = toyplot.color.brewer.map("Set3"),
style={"stroke-width": 0.5},
)
a.show = False
## save figure
import toyplot.svg
toyplot.svg.render(c, "figures/structure-k{}.svg".format(kpop))
import toyplot
import numpy as np
## create canvas
c = toyplot.Canvas(width=600, height=160)
a = c.cartesian()
## order of colors
order = [2, 7, 0, 5, 4, 1, 6, 3][::-1]
## space between bars
minpos = np.linspace(0, 38, 38)
maxpos = np.linspace(0.9, 38.9, 38)
## plot bars
m = a.bars(minpos, maxpos,
qtable[8].loc[rtre.get_tip_labels()[::-1], order],
color = toyplot.color.brewer.map("Set3"),
style={"stroke-width": 0})
a.show = False
## save figure
import toyplot.svg
toyplot.svg.render(c, "figures/structure-final.svg")
c