The code below is a Python implementaton of R code from the "Mousies" script by Mario dos Reis for converting BPP units for theta and tau into Ne and divergence times in geological time units. https://github.com/mariodosreis/mousies/blob/master/R/analysis.R).
import numpy as np
import pandas as pd
import scipy.stats as ss
import toyplot
import glob
# get all mcmc file handles
allfiles = glob.glob("analysis-bpp/bpp00_theta_2_2000_tau_2_2000_r*.mcmc.txt")
# get headers
headers = open(allfiles[0]).readline()
# concatenate all files with header into a single file
outname = "analysis-bpp/bpp00_theta_2_2000_tau_2_2000_concat_mcmc.txt"
with open(outname, 'w') as out:
out.write(headers)
for mcmcfile in allfiles:
out.write("\n".join(open(mcmcfile).readlines()[1:]))
# load data as a dataframe
df = pd.read_table(outname)
# print n samples
df.shape
(500000, 27)
# show head
df.head()
Gen | theta_11A | theta_21B | theta_31C | theta_42A | theta_52B | theta_62C | theta_73A | theta_83B | theta_93C | ... | theta_173B3C | tau_101A1B1C2A2B2C3A3B3C | tau_111A1B1C | tau_121B1C | tau_132A2B2C3A3B3C | tau_142A2B2C | tau_152B2C | tau_163A3B3C | tau_173B3C | lnL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10 | 0.001989 | 0.000738 | 0.001069 | 0.002951 | 0.002308 | 0.005580 | 0.000913 | 0.005583 | 0.003326 | ... | 0.001435 | 0.000907 | 0.000569 | 0.000373 | 0.000835 | 0.000833 | 0.000803 | 0.000636 | 0.000379 | -11412.977 |
1 | 20 | 0.001879 | 0.000957 | 0.001187 | 0.002813 | 0.002137 | 0.005929 | 0.001139 | 0.005419 | 0.003061 | ... | 0.001689 | 0.001007 | 0.000584 | 0.000373 | 0.000926 | 0.000924 | 0.000897 | 0.000758 | 0.000368 | -11397.858 |
2 | 30 | 0.001472 | 0.000691 | 0.001157 | 0.002352 | 0.002130 | 0.005665 | 0.001222 | 0.004466 | 0.002960 | ... | 0.001656 | 0.000999 | 0.000597 | 0.000352 | 0.000904 | 0.000902 | 0.000867 | 0.000784 | 0.000357 | -11376.825 |
3 | 40 | 0.001977 | 0.000613 | 0.001028 | 0.002151 | 0.001685 | 0.005218 | 0.001071 | 0.002398 | 0.003097 | ... | 0.001602 | 0.000882 | 0.000516 | 0.000329 | 0.000803 | 0.000801 | 0.000785 | 0.000693 | 0.000291 | -11416.800 |
4 | 50 | 0.002236 | 0.000694 | 0.001163 | 0.002439 | 0.002001 | 0.005691 | 0.001211 | 0.002892 | 0.003406 | ... | 0.001850 | 0.000998 | 0.000596 | 0.000364 | 0.000909 | 0.000906 | 0.000884 | 0.000777 | 0.000352 | -11362.362 |
5 rows × 27 columns
# show all thetas
theta_names = [i for i in df.columns if "theta" in i]
df.loc[:, theta_names].columns.tolist()
['theta_11A', 'theta_21B', 'theta_31C', 'theta_42A', 'theta_52B', 'theta_62C', 'theta_73A', 'theta_83B', 'theta_93C', 'theta_101A1B1C2A2B2C3A3B3C', 'theta_111A1B1C', 'theta_121B1C', 'theta_132A2B2C3A3B3C', 'theta_142A2B2C', 'theta_152B2C', 'theta_163A3B3C', 'theta_173B3C']
# show all taus
tau_names = [i for i in df.columns if "tau" in i]
df.loc[:, tau_names].columns.tolist()
['tau_101A1B1C2A2B2C3A3B3C', 'tau_111A1B1C', 'tau_121B1C', 'tau_132A2B2C3A3B3C', 'tau_142A2B2C', 'tau_152B2C', 'tau_163A3B3C', 'tau_173B3C']
Generation time is represented by a gamma distribution with mean halfway between our estimates of 8 and 16 years.
# my designated min and max range for the generation time of Malagasy Canarium
emax = 12.
emin = 24
# mean of prior is midway between max and min
mean = (emax + emin) / 2.
# var of prior is (range / 4)**2
var = ((emax - emin) ** 2) / 16
# shape and scale parameters of the gamma dist.
a = mean ** 2 / var
b = mean / var
# get 100 values evenly spaced across this gamma dist
x = np.linspace(
ss.gamma.ppf(0.0001, a, **{'scale': 1 / b}),
ss.gamma.ppf(0.9999, a, **{'scale': 1 / b}),
100)
# plot values across range of gamma
toyplot.fill(
x,
ss.gamma.pdf(x, a, **{'scale': 1 / b}),
height=300,
width=400,
xlabel = "Generation time (years)",
ylabel = "Prob. density Gamma(g|a,b)"
);
# sample random variables from this distribution
gentime_rvs = ss.gamma.rvs(a, **{'scale': 1/b, 'random_state': 123, 'size': df.shape[0]})
Per generation mutation rate is represented by a gamma distribution with mean halfway between our estimates of 0.5 and 1.5 x 10 ^ -8.
# my designated min and max range for the per gen mutation rate of Canarium (x10-8)
emax = 0.5
emin = 1.5
# mean of prior is midway between max and min
mean = (emax + emin) / 2.
# var of prior is (range / 4)**2
var = ((emax - emin) ** 2) / 16
# shape and scale parameters of the gamma dist.
a = mean ** 2 / var
b = mean / var
# get 100 values evenly spaced across this gamma dist
x = np.linspace(
ss.gamma.ppf(0.0001, a, **{'scale': 1/b}),
ss.gamma.ppf(0.9999, a, **{'scale': 1/b}),
100)
# plot values across range of gamma
toyplot.fill(
x,
ss.gamma.pdf(x, a, **{'scale': 1/b}),
height=300,
width=400,
xlabel = "Per generation mutation rate (x 10^-8)",
ylabel = "Prob. density Gamma(g|a,b)"
);
# sample random variables from this distribution
mutrate_rvs = ss.gamma.rvs(a, **{'scale': 1/b, 'random_state': 123, 'size': df.shape[0]})
for column in df:
if "theta" in column:
_, name = column.split("_")
df["Ne_{}".format(name)] = df.loc[:, column] / ((mutrate_rvs * 1e-8) * 4)
if "tau" in column:
_, name = column.split("_")
df["div_{}".format(name)] = (df.loc[:, column] * gentime_rvs) / (mutrate_rvs * 1e-8)
# plot example transformed values for one lineage
toyplot.bars(
np.histogram(
df["Ne_11A"],
bins=30,
normed=True,
),
height=300,
width=400,
xmax=200000,
ylabel="frequency",
xlabel="Ne (effective population size)",
label="Lineage 1A"
);
# plot example transformed values for one lineage
toyplot.bars(
np.histogram(
df["div_121B1C"] / 1e6,
bins=20,
normed=True,
),
height=300,
width=400,
ylabel="frequency",
xlabel="divergence time in Mya, Lineage (1B,1C)",
label="Lineage (1B,1C)"
);
# make a nice dataframe for divergence time summary
divergence_times = (
df[[col for col in df.columns if "div" in col]]
.apply(lambda x: x / 1e6)
.rename(columns=lambda x: x[6:])
.describe()
.T
)
# save as CSV and show
divergence_times.to_csv("divtimes.csv")
divergence_times.style.format("{:.3f}")
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
1A1B1C2A2B2C3A3B3C | 500000.000 | 1.797 | 0.646 | 0.365 | 1.341 | 1.686 | 2.130 | 10.098 |
1A1B1C | 500000.000 | 1.189 | 0.504 | 0.142 | 0.832 | 1.103 | 1.450 | 6.480 |
1B1C | 500000.000 | 0.839 | 0.417 | 0.029 | 0.542 | 0.771 | 1.058 | 6.142 |
2A2B2C3A3B3C | 500000.000 | 1.717 | 0.612 | 0.359 | 1.285 | 1.613 | 2.033 | 9.725 |
2A2B2C | 500000.000 | 1.592 | 0.581 | 0.317 | 1.180 | 1.497 | 1.894 | 8.250 |
2B2C | 500000.000 | 1.440 | 0.552 | 0.179 | 1.051 | 1.353 | 1.729 | 7.671 |
3A3B3C | 500000.000 | 1.188 | 0.475 | 0.172 | 0.851 | 1.105 | 1.433 | 6.648 |
3B3C | 500000.000 | 0.784 | 0.354 | 0.038 | 0.536 | 0.726 | 0.969 | 4.123 |
# make a nice dataframe for Ne summary
nes = (
df[[col for col in df.columns if "Ne" in col]]
.rename(columns=lambda x: x.split("Ne_")[1])
.describe()
.T
)
# save as CSV and show as ints
nes.to_csv("Ne.csv")
nes.style.format("{:.0f}")
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
11A | 500000 | 47851 | 20541 | 6519 | 33324 | 43961 | 57962 | 390065 |
21B | 500000 | 24456 | 13482 | 997 | 14936 | 21603 | 30719 | 316413 |
31C | 500000 | 34596 | 16758 | 3869 | 22989 | 31116 | 42246 | 268909 |
42A | 500000 | 61369 | 24123 | 11159 | 44095 | 57117 | 73855 | 338214 |
52B | 500000 | 29358 | 12745 | 5264 | 20380 | 26732 | 35438 | 186183 |
62C | 500000 | 167167 | 60579 | 28492 | 124296 | 156978 | 198394 | 964643 |
73A | 500000 | 27145 | 14384 | 2619 | 17033 | 23701 | 33598 | 228811 |
83B | 500000 | 102886 | 40349 | 12817 | 74452 | 96084 | 123512 | 720338 |
93C | 500000 | 65524 | 29903 | 5649 | 44089 | 60904 | 81415 | 370538 |
101A1B1C2A2B2C3A3B3C | 500000 | 56577 | 22358 | 8143 | 40679 | 52866 | 68303 | 393473 |
111A1B1C | 500000 | 37927 | 22784 | 848 | 21632 | 33597 | 49187 | 286192 |
121B1C | 500000 | 42635 | 27090 | 539 | 23264 | 36907 | 55619 | 408250 |
132A2B2C3A3B3C | 500000 | 34551 | 23743 | 869 | 17742 | 28877 | 45143 | 364323 |
142A2B2C | 500000 | 30764 | 22567 | 610 | 14782 | 25346 | 40560 | 353217 |
152B2C | 500000 | 38509 | 26124 | 542 | 19977 | 32359 | 50210 | 401377 |
163A3B3C | 500000 | 44512 | 26066 | 884 | 26299 | 39188 | 56776 | 406606 |
173B3C | 500000 | 49146 | 31577 | 882 | 26503 | 42352 | 64002 | 456276 |