Canarium GBS: ABBA-BABA analyses¶

Federman et al.¶

This notebook provides all code necessary to reproduce the ABBA-BABA analyses used in Federman et al. (xxxx). All code in this notebook is written in Python.

Required software¶

In [1]:

## conda install ipyrad -c ipyrad
## conda install toytree -c eaton-lab

Imports¶

In [2]:

import toytree
import toyplot.svg
import ipyrad as ip
import ipyrad.analysis as ipa
print "ipyrad v.{}".format(ip.__version__)

ipyrad v.0.7.23

Connect to cluster¶

In [3]:

import ipyparallel as ipp
ipyclient = ipp.Client()
ip.cluster_info(ipyclient)

host compute node: [40 cores] on sacra

ABBA-BABA tests¶

In [4]:

## load input files
locifile = "./analysis-ipyrad/Canarium-min10_outfiles/Canarium-min10.loci"
newick = "./analysis-raxml/RAxML_bestTree.Canarium-min10"

In [5]:

## make clade lists for setting up tests easily
clade1A = ["SF328", "SF200", "SF175"]
clade1B = ["SF209", "D13052"]
clade1C = ["SF172", "D14528", "SF286", "SF276"]
clade1 = clade1A + clade1B + clade1C

clade2A = ["D14482", "D14483", "D13103", "D13101"]
clade2B = ["D14504", "D14505", "D14506"]
clade2C = ["D14501", "D14513", "D14480", "D14485", "D14477", "D14478"]
clade2 = clade2A + clade2B + clade2C

clade3X = ["D12963"]
clade3A = ["D13090", "D12950"]
clade3B = ["SF155", "5573", "SF327", "SF228", "SF224"]
clade3C = ["SF164", "SF153", "SF160", "D13097", "SF197", "D13075", "D13053", "D13063"]
clade3 = clade3A + clade3B + clade3C

outgroups = ["D13852", "SFC1988", "D14269", "D13374"]

Question 1: Is SF172 a hybrid?¶

In [6]:

q1 = ipa.baba(data=locifile, newick=newick)
q1.tests = [{
    "p4": outgroups,
    "p3": clade3 + clade3X, 
    "p2": clade1A + clade1B,
    "p1": ["SF172"],
}]
q1.run(ipyclient)
q1.results_table

[####################] 100%  calculating D-stats  | 0:00:38 |

Out[6]:

	dstat	bootmean	bootstd	Z	ABBA	BABA	nloci
0	-0.575	-0.575	0.014	41.321	386.765	1432.762	43331

Question 2: Admixture between clades 1 and 3¶

In [7]:

q2 = ipa.baba(data=locifile, newick=newick)
q2.tests = [{
    "p4": outgroups,
    "p3": list(set(clade1) - set(["SF172"])), 
    "p2": clade2, 
    "p1": clade3 + clade3X, 
    }]
q2.run(ipyclient)
q2.results_table

[####################] 100%  calculating D-stats  | 0:00:53 |

Out[7]:

	dstat	bootmean	bootstd	Z	ABBA	BABA	nloci
0	-0.072	-0.072	0.014	5.042	1375.145	1587.853	59670

Question 3: Admixture between clades 1 & 3 geographic¶

In [8]:

q3 = ipa.baba(data=locifile, newick=newick)
q3.tests = [
    {
    "p4": outgroups,
    "p3": list(set(clade1) - set(["SF172"])), 
    "p2": list(set(clade3C) - set(["D13097"])),
    "p1": ["D13097"], 
    },
    {
    "p4": outgroups,
    "p3": list(set(clade1) - set(["SF172"])), 
    "p2": list(set(clade3B) - set(["D13097"])),
    "p1": ["D13097"], 
    },
]
q3.run(ipyclient)
q3.results_table

[####################] 100%  calculating D-stats  | 0:00:43 |

Out[8]:

	dstat	bootmean	bootstd	Z	ABBA	BABA	nloci
0	-0.018	-0.019	0.028	0.662	380.611	394.906	50799
1	0.010	0.010	0.024	0.415	608.591	596.399	51984

Question 4: Admixture into clade 3 (3A, 3B, 3C comparison)¶

In [9]:

q4 = ipa.baba(data=locifile, newick=newick)
q4.tests = [{
    "p4": outgroups,
    "p3": list(set(clade1) - set(["SF172"])), 
    "p2": clade2,
    "p1": clade, 
    } for clade in [clade3A, clade3B, clade3C, clade3B + clade3C]]
q4.run(ipyclient)
q4.results_table

[####################] 100%  calculating D-stats  | 0:00:58 |

Out[9]:

	dstat	bootmean	bootstd	Z	ABBA	BABA	nloci
0	-0.041	-0.040	0.026	1.566	521.038	565.147	32453
1	-0.088	-0.089	0.015	5.851	1265.351	1511.058	57566
2	-0.066	-0.066	0.015	4.346	1264.598	1444.093	57496
3	-0.077	-0.077	0.014	5.539	1358.093	1584.447	59287

Question 5: Admixture between clades 2A and 2B or 2C¶

In [10]:

q5 = ipa.baba(data=locifile, newick=newick)
q5.tests = [
    {
    "p4": outgroups,
    "p3": clade2A, 
    "p2": clade2B,
    "p1": clade2C,
    }, 
    {
    "p4": outgroups,
    "p3": clade2A, 
    "p2": list(set(clade2C) - set(["D14513"])),
    "p1": ["D14513"],
    }, 
    {
    "p4": outgroups,
    "p3": clade2A, 
    "p2": list(set(clade2C) - set(["D14513", "D14501"])),
    "p1": ["D14513", "D14501"],
    }, 
    {
    "p4": outgroups, 
    "p3": clade2B, 
    "p2": ["D14483", "D14482"],
    "p1": ["D13101", "D13103"],
    },
    {
    "p4": outgroups, 
    "p3": clade2C, 
    "p2": ["D14483", "D14482"],
    "p1": ["D13101", "D13103"],
    },
]
q5.run(ipyclient)
q5.results_table

[####################] 100%  calculating D-stats  | 0:00:43 |

Out[10]:

	dstat	bootmean	bootstd	Z	ABBA	BABA	nloci
0	-0.085	-0.084	0.017	4.932	985.638	1168.777	50034
1	0.105	0.104	0.028	3.712	369.301	299.112	26788
2	0.113	0.113	0.020	5.680	620.334	494.679	36498
3	0.043	0.043	0.050	0.867	134.177	122.993	19236
4	-0.028	-0.028	0.042	0.657	141.597	149.623	20053

Question 6: Admixture between clades 2B and 2C¶

In [11]:

q6 = ipa.baba(data=locifile, newick=newick)
q6.generate_tests_from_tree(
    constraint_exact=[False, False, True, True],
    constraint_dict={
    "p4": outgroups,
    "p3": clade2B,
    "p2": clade2C,
    "p1": clade2C,
    })
q6.run(ipyclient)
q6.results_table.sort_values(by='Z', ascending=False).head(10)

50 tests generated from tree
[####################] 100%  calculating D-stats  | 0:01:44 |

Out[11]:

	dstat	bootmean	bootstd	Z	ABBA	BABA	nloci
17	-0.260	-0.261	0.021	12.352	407.642	694.587	32234
0	0.233	0.233	0.019	11.979	719.430	447.241	33055
4	0.236	0.236	0.020	11.913	716.248	442.405	32955
13	-0.236	-0.236	0.020	11.897	442.405	716.248	32955
8	0.260	0.259	0.022	11.893	694.587	407.642	32234
9	-0.233	-0.232	0.020	11.853	447.241	719.430	33055
11	-0.229	-0.229	0.020	11.595	450.432	717.972	33016
2	0.229	0.229	0.020	11.267	717.972	450.432	33016
6	0.234	0.233	0.021	11.175	706.479	438.950	32776
15	-0.234	-0.234	0.021	10.898	438.950	706.479	32776

Question 7: Admixture between 2C and 3¶

In [12]:

q7 = ipa.baba(data=locifile, newick=newick)
q7.generate_tests_from_tree(
    constraint_exact=[False, False, True, True],
    constraint_dict={
    "p4": outgroups,
    "p3": clade3B + clade3C, 
    "p2": clade2C,
    "p1": clade2C,
    })
q7.run(ipyclient)
q7.results_table.sort_values(by='Z', ascending=False).head(10)

50 tests generated from tree
[####################] 100%  calculating D-stats  | 0:02:00 |

Out[12]:

	dstat	bootmean	bootstd	Z	ABBA	BABA	nloci
1	-0.236	-0.236	0.039	6.107	149.450	241.929	22961
10	0.236	0.236	0.039	6.005	241.929	149.450	22961
26	-0.206	-0.203	0.037	5.546	154.434	234.457	23524
19	0.206	0.206	0.038	5.458	234.457	154.434	23524
24	0.156	0.157	0.029	5.354	344.367	251.328	27085
25	-0.137	-0.137	0.027	5.163	263.416	347.221	27762
30	-0.181	-0.182	0.035	5.150	180.356	260.083	24877
23	0.181	0.181	0.036	5.056	260.083	180.356	24877
27	-0.134	-0.135	0.027	5.004	263.861	345.722	27697
18	0.137	0.138	0.028	4.984	347.221	263.416	27762

Question 8: Clades 2C and 3 get top tests¶

In [13]:

qq = ipa.baba(data=locifile, newick=newick)
qq.generate_tests_from_tree(
    constraint_exact=[True, False, False, True],
    constraint_dict={
    "p4": outgroups,
    "p3": clade3B + clade3C, 
    "p2": ["D14513", "D14501"],
    "p1": ["D14485", "D14480", "D14477", "D14478"],
    })
qq.run(ipyclient)

50 tests generated from tree
[####################] 100%  calculating D-stats  | 0:01:18 |

In [14]:

## get top 5 tests 
idxs = qq.results_table.sort_values(by='Z', ascending=False).head(5).index.tolist()
qtests = [qq.tests[idx] for idx in idxs]

Make a summary plot¶

In [15]:

## top tests from q6, q7, and q8
q6s = [{
    "p4": outgroups, 
    "p3": clade2B,
    "p2": ["D14485", "D14480", "D14477", "D14478"],
    "p1": ["D14501"],
}]
q7s = [{
    "p4": outgroups, 
    "p3": clade3B + clade3C, 
    "p2": ["D14485", "D14480", "D14477", "D14478"],
    "p1": ["D14513"],
}]
q8s = [{
    "p4": outgroups, 
    "p3": clade3A,
    "p2": list(set(clade3B + clade3C) - set(["D12963"])),
    "p1": ["D12963"],
}]

In [16]:

qall = ipa.baba(data=locifile, newick=newick)
qall.tests = q1.tests + q2.tests + q3.tests + q4.tests + q5.tests +\
             q6s + q7s + q8s + qtests
qall.run(ipyclient)

[####################] 100%  calculating D-stats  | 0:01:11 |

In [45]:

neworder = [0, 1, 4, 5, 6, 7, 2, 3, 15] + range(8, 15) + range(16, 21)

In [46]:

canvas, axes, mark = qall.plot(
    pct_tree_x=0.65,
    width=900,
    height=500,
    style_tip_labels={"font-size": "11px"},
    style_results_labels={"font-size": "11px"},
    alpha=3.0,
    subset_tests=neworder
);

In [47]:

# save figure
import toyplot.svg
toyplot.svg.render(canvas, "figures/abba-baba.svg")
! cp figures/abba-baba.svg /home/deren/Dropbox/

In [48]:

# join taxon names to test results in full_table
qall.full_table = qall.results_table.copy()
for col in qall.taxon_table.columns:
    qall.full_table[col] = qall.taxon_table[col]
    
# save results table in new order, reset index, and save
fulltable = qall.full_table.loc[neworder]
fulltable.index = range(fulltable.shape[0])
fulltable.to_csv("Table_S6.csv", float_format="%.3f")

In [55]:

import numpy as np
fulltable.nloci.mean(), np.mean(fulltable.ABBA + fulltable.BABA)

Out[55]:

(35375.904761904763, 1180.7655167845865)