## conda install ipyrad -c ipyrad
## conda install toytree -c eaton-lab
import toytree
import toyplot.svg
import ipyrad as ip
import ipyrad.analysis as ipa
print "ipyrad v.{}".format(ip.__version__)
ipyrad v.0.7.23
import ipyparallel as ipp
ipyclient = ipp.Client()
ip.cluster_info(ipyclient)
host compute node: [40 cores] on sacra
## load input files
locifile = "./analysis-ipyrad/Canarium-min10_outfiles/Canarium-min10.loci"
newick = "./analysis-raxml/RAxML_bestTree.Canarium-min10"
## make clade lists for setting up tests easily
clade1A = ["SF328", "SF200", "SF175"]
clade1B = ["SF209", "D13052"]
clade1C = ["SF172", "D14528", "SF286", "SF276"]
clade1 = clade1A + clade1B + clade1C
clade2A = ["D14482", "D14483", "D13103", "D13101"]
clade2B = ["D14504", "D14505", "D14506"]
clade2C = ["D14501", "D14513", "D14480", "D14485", "D14477", "D14478"]
clade2 = clade2A + clade2B + clade2C
clade3X = ["D12963"]
clade3A = ["D13090", "D12950"]
clade3B = ["SF155", "5573", "SF327", "SF228", "SF224"]
clade3C = ["SF164", "SF153", "SF160", "D13097", "SF197", "D13075", "D13053", "D13063"]
clade3 = clade3A + clade3B + clade3C
outgroups = ["D13852", "SFC1988", "D14269", "D13374"]
q1 = ipa.baba(data=locifile, newick=newick)
q1.tests = [{
"p4": outgroups,
"p3": clade3 + clade3X,
"p2": clade1A + clade1B,
"p1": ["SF172"],
}]
q1.run(ipyclient)
q1.results_table
[####################] 100% calculating D-stats | 0:00:38 |
dstat | bootmean | bootstd | Z | ABBA | BABA | nloci | |
---|---|---|---|---|---|---|---|
0 | -0.575 | -0.575 | 0.014 | 41.321 | 386.765 | 1432.762 | 43331 |
q2 = ipa.baba(data=locifile, newick=newick)
q2.tests = [{
"p4": outgroups,
"p3": list(set(clade1) - set(["SF172"])),
"p2": clade2,
"p1": clade3 + clade3X,
}]
q2.run(ipyclient)
q2.results_table
[####################] 100% calculating D-stats | 0:00:53 |
dstat | bootmean | bootstd | Z | ABBA | BABA | nloci | |
---|---|---|---|---|---|---|---|
0 | -0.072 | -0.072 | 0.014 | 5.042 | 1375.145 | 1587.853 | 59670 |
q3 = ipa.baba(data=locifile, newick=newick)
q3.tests = [
{
"p4": outgroups,
"p3": list(set(clade1) - set(["SF172"])),
"p2": list(set(clade3C) - set(["D13097"])),
"p1": ["D13097"],
},
{
"p4": outgroups,
"p3": list(set(clade1) - set(["SF172"])),
"p2": list(set(clade3B) - set(["D13097"])),
"p1": ["D13097"],
},
]
q3.run(ipyclient)
q3.results_table
[####################] 100% calculating D-stats | 0:00:43 |
dstat | bootmean | bootstd | Z | ABBA | BABA | nloci | |
---|---|---|---|---|---|---|---|
0 | -0.018 | -0.019 | 0.028 | 0.662 | 380.611 | 394.906 | 50799 |
1 | 0.010 | 0.010 | 0.024 | 0.415 | 608.591 | 596.399 | 51984 |
q4 = ipa.baba(data=locifile, newick=newick)
q4.tests = [{
"p4": outgroups,
"p3": list(set(clade1) - set(["SF172"])),
"p2": clade2,
"p1": clade,
} for clade in [clade3A, clade3B, clade3C, clade3B + clade3C]]
q4.run(ipyclient)
q4.results_table
[####################] 100% calculating D-stats | 0:00:58 |
dstat | bootmean | bootstd | Z | ABBA | BABA | nloci | |
---|---|---|---|---|---|---|---|
0 | -0.041 | -0.040 | 0.026 | 1.566 | 521.038 | 565.147 | 32453 |
1 | -0.088 | -0.089 | 0.015 | 5.851 | 1265.351 | 1511.058 | 57566 |
2 | -0.066 | -0.066 | 0.015 | 4.346 | 1264.598 | 1444.093 | 57496 |
3 | -0.077 | -0.077 | 0.014 | 5.539 | 1358.093 | 1584.447 | 59287 |
q5 = ipa.baba(data=locifile, newick=newick)
q5.tests = [
{
"p4": outgroups,
"p3": clade2A,
"p2": clade2B,
"p1": clade2C,
},
{
"p4": outgroups,
"p3": clade2A,
"p2": list(set(clade2C) - set(["D14513"])),
"p1": ["D14513"],
},
{
"p4": outgroups,
"p3": clade2A,
"p2": list(set(clade2C) - set(["D14513", "D14501"])),
"p1": ["D14513", "D14501"],
},
{
"p4": outgroups,
"p3": clade2B,
"p2": ["D14483", "D14482"],
"p1": ["D13101", "D13103"],
},
{
"p4": outgroups,
"p3": clade2C,
"p2": ["D14483", "D14482"],
"p1": ["D13101", "D13103"],
},
]
q5.run(ipyclient)
q5.results_table
[####################] 100% calculating D-stats | 0:00:43 |
dstat | bootmean | bootstd | Z | ABBA | BABA | nloci | |
---|---|---|---|---|---|---|---|
0 | -0.085 | -0.084 | 0.017 | 4.932 | 985.638 | 1168.777 | 50034 |
1 | 0.105 | 0.104 | 0.028 | 3.712 | 369.301 | 299.112 | 26788 |
2 | 0.113 | 0.113 | 0.020 | 5.680 | 620.334 | 494.679 | 36498 |
3 | 0.043 | 0.043 | 0.050 | 0.867 | 134.177 | 122.993 | 19236 |
4 | -0.028 | -0.028 | 0.042 | 0.657 | 141.597 | 149.623 | 20053 |
q6 = ipa.baba(data=locifile, newick=newick)
q6.generate_tests_from_tree(
constraint_exact=[False, False, True, True],
constraint_dict={
"p4": outgroups,
"p3": clade2B,
"p2": clade2C,
"p1": clade2C,
})
q6.run(ipyclient)
q6.results_table.sort_values(by='Z', ascending=False).head(10)
50 tests generated from tree [####################] 100% calculating D-stats | 0:01:44 |
dstat | bootmean | bootstd | Z | ABBA | BABA | nloci | |
---|---|---|---|---|---|---|---|
17 | -0.260 | -0.261 | 0.021 | 12.352 | 407.642 | 694.587 | 32234 |
0 | 0.233 | 0.233 | 0.019 | 11.979 | 719.430 | 447.241 | 33055 |
4 | 0.236 | 0.236 | 0.020 | 11.913 | 716.248 | 442.405 | 32955 |
13 | -0.236 | -0.236 | 0.020 | 11.897 | 442.405 | 716.248 | 32955 |
8 | 0.260 | 0.259 | 0.022 | 11.893 | 694.587 | 407.642 | 32234 |
9 | -0.233 | -0.232 | 0.020 | 11.853 | 447.241 | 719.430 | 33055 |
11 | -0.229 | -0.229 | 0.020 | 11.595 | 450.432 | 717.972 | 33016 |
2 | 0.229 | 0.229 | 0.020 | 11.267 | 717.972 | 450.432 | 33016 |
6 | 0.234 | 0.233 | 0.021 | 11.175 | 706.479 | 438.950 | 32776 |
15 | -0.234 | -0.234 | 0.021 | 10.898 | 438.950 | 706.479 | 32776 |
q7 = ipa.baba(data=locifile, newick=newick)
q7.generate_tests_from_tree(
constraint_exact=[False, False, True, True],
constraint_dict={
"p4": outgroups,
"p3": clade3B + clade3C,
"p2": clade2C,
"p1": clade2C,
})
q7.run(ipyclient)
q7.results_table.sort_values(by='Z', ascending=False).head(10)
50 tests generated from tree [####################] 100% calculating D-stats | 0:02:00 |
dstat | bootmean | bootstd | Z | ABBA | BABA | nloci | |
---|---|---|---|---|---|---|---|
1 | -0.236 | -0.236 | 0.039 | 6.107 | 149.450 | 241.929 | 22961 |
10 | 0.236 | 0.236 | 0.039 | 6.005 | 241.929 | 149.450 | 22961 |
26 | -0.206 | -0.203 | 0.037 | 5.546 | 154.434 | 234.457 | 23524 |
19 | 0.206 | 0.206 | 0.038 | 5.458 | 234.457 | 154.434 | 23524 |
24 | 0.156 | 0.157 | 0.029 | 5.354 | 344.367 | 251.328 | 27085 |
25 | -0.137 | -0.137 | 0.027 | 5.163 | 263.416 | 347.221 | 27762 |
30 | -0.181 | -0.182 | 0.035 | 5.150 | 180.356 | 260.083 | 24877 |
23 | 0.181 | 0.181 | 0.036 | 5.056 | 260.083 | 180.356 | 24877 |
27 | -0.134 | -0.135 | 0.027 | 5.004 | 263.861 | 345.722 | 27697 |
18 | 0.137 | 0.138 | 0.028 | 4.984 | 347.221 | 263.416 | 27762 |
qq = ipa.baba(data=locifile, newick=newick)
qq.generate_tests_from_tree(
constraint_exact=[True, False, False, True],
constraint_dict={
"p4": outgroups,
"p3": clade3B + clade3C,
"p2": ["D14513", "D14501"],
"p1": ["D14485", "D14480", "D14477", "D14478"],
})
qq.run(ipyclient)
50 tests generated from tree [####################] 100% calculating D-stats | 0:01:18 |
## get top 5 tests
idxs = qq.results_table.sort_values(by='Z', ascending=False).head(5).index.tolist()
qtests = [qq.tests[idx] for idx in idxs]
## top tests from q6, q7, and q8
q6s = [{
"p4": outgroups,
"p3": clade2B,
"p2": ["D14485", "D14480", "D14477", "D14478"],
"p1": ["D14501"],
}]
q7s = [{
"p4": outgroups,
"p3": clade3B + clade3C,
"p2": ["D14485", "D14480", "D14477", "D14478"],
"p1": ["D14513"],
}]
q8s = [{
"p4": outgroups,
"p3": clade3A,
"p2": list(set(clade3B + clade3C) - set(["D12963"])),
"p1": ["D12963"],
}]
qall = ipa.baba(data=locifile, newick=newick)
qall.tests = q1.tests + q2.tests + q3.tests + q4.tests + q5.tests +\
q6s + q7s + q8s + qtests
qall.run(ipyclient)
[####################] 100% calculating D-stats | 0:01:11 |
neworder = [0, 1, 4, 5, 6, 7, 2, 3, 15] + range(8, 15) + range(16, 21)
canvas, axes, mark = qall.plot(
pct_tree_x=0.65,
width=900,
height=500,
style_tip_labels={"font-size": "11px"},
style_results_labels={"font-size": "11px"},
alpha=3.0,
subset_tests=neworder
);
# save figure
import toyplot.svg
toyplot.svg.render(canvas, "figures/abba-baba.svg")
! cp figures/abba-baba.svg /home/deren/Dropbox/
# join taxon names to test results in full_table
qall.full_table = qall.results_table.copy()
for col in qall.taxon_table.columns:
qall.full_table[col] = qall.taxon_table[col]
# save results table in new order, reset index, and save
fulltable = qall.full_table.loc[neworder]
fulltable.index = range(fulltable.shape[0])
fulltable.to_csv("Table_S6.csv", float_format="%.3f")
import numpy as np
fulltable.nloci.mean(), np.mean(fulltable.ABBA + fulltable.BABA)
(35375.904761904763, 1180.7655167845865)