I simulated two trees to work with. One that is completely imbalanced (ladder-like) and one that is balanced (equal number tips descended from each node). I'm using the Python package ete2
for most of the tree manipulations. This notebook was run in Python 2.7. You will also need the package rpy2 installed, as well as a working version of R with the package 'ape' to make tree plots later in the notebook.
## standard Python imports
import glob
import itertools
from collections import OrderedDict, Counter
## extra Python imports
import rpy2 ## required for tree plotting
import ete2 ## used for tree manipulation
import egglib ## used for coalescent simulations
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
## print versions
for pkg in [matplotlib, np, pd, ete2, rpy2]:
print "{:<10}\t{:<10}".\
format(pkg.__name__, pkg.__version__)
matplotlib 1.5.1 numpy 1.11.0 pandas 0.18.0 ete2 2.3.10 rpy2 2.8.2
I wrote a program to simulate RAD-seq like sequence data which uses the python package egglib for coalescent simulations. Below will check that you have the relevant software installed. See here for simrrls installation: https://github.com/dereneaton/simrrls
## check simrrls package and requirements
import egglib
import simrrls
## print versions
print 'egglib', egglib.version
print 'simrrls', simrrls.__version__
egglib 2.1.11 simrrls 0.0.11
## base tree
Tbal = ete2.Tree()
## branch lengths
bls = 1.
## namer
n = iter(('s'+str(i) for i in xrange(1,1500)))
## first nodes
n1 = Tbal.add_child(name=n.next(), dist=bls)
n2 = Tbal.add_child(name=n.next(), dist=bls)
## make balanced tree
while len(Tbal.get_leaves()) < 64:
thisrep = Tbal.get_descendants()
for node in thisrep:
if len(node.get_children()) < 1:
node.add_child(name=n.next(), dist=bls)
node.add_child(name=n.next(), dist=bls)
## Save newick string to file
Tbal.write(outfile="Tbal.tre", format=3)
String representation
## newick string
! cat Tbal.tre
((((((s63:1,s64:1)s31:1,(s65:1,s66:1)s32:1)s15:1,((s67:1,s68:1)s33:1,(s69:1,s70:1)s34:1)s16:1)s7:1,(((s71:1,s72:1)s35:1,(s73:1,s74:1)s36:1)s17:1,((s75:1,s76:1)s37:1,(s77:1,s78:1)s38:1)s18:1)s8:1)s3:1,((((s79:1,s80:1)s39:1,(s81:1,s82:1)s40:1)s19:1,((s83:1,s84:1)s41:1,(s85:1,s86:1)s42:1)s20:1)s9:1,(((s87:1,s88:1)s43:1,(s89:1,s90:1)s44:1)s21:1,((s91:1,s92:1)s45:1,(s93:1,s94:1)s46:1)s22:1)s10:1)s4:1)s1:1,(((((s95:1,s96:1)s47:1,(s97:1,s98:1)s48:1)s23:1,((s99:1,s100:1)s49:1,(s101:1,s102:1)s50:1)s24:1)s11:1,(((s103:1,s104:1)s51:1,(s105:1,s106:1)s52:1)s25:1,((s107:1,s108:1)s53:1,(s109:1,s110:1)s54:1)s26:1)s12:1)s5:1,((((s111:1,s112:1)s55:1,(s113:1,s114:1)s56:1)s27:1,((s115:1,s116:1)s57:1,(s117:1,s118:1)s58:1)s28:1)s13:1,(((s119:1,s120:1)s59:1,(s121:1,s122:1)s60:1)s29:1,((s123:1,s124:1)s61:1,(s125:1,s126:1)s62:1)s30:1)s14:1)s6:1)s2:1);
## show tree, remove node circles
#for node in Tbal.traverse():
# node.img_style["size"] = 0
#Tbal.render("%%inline", h=500)
## base tree
Timb = ete2.Tree()
## namer
n = iter(('s'+str(i) for i in range(1,5000)))
## scale branches to match balanced treelength
brlen = (bls*6.)/63
## first nodes
n1 = Timb.add_child(name=n.next(), dist=brlen)
n2 = Timb.add_child(name=n.next(), dist=brlen)
while len(Timb.get_leaves()) < 64:
## extend others
for tip in Timb.get_leaves()[:-1]:
tip.dist += brlen
## extend the last node
Timb.get_leaves()[-1].add_child(name=n.next(), dist=brlen)
Timb.get_leaves()[-1].add_sister(name=n.next(), dist=brlen)
## write to file
Timb.write(outfile="Timb.tre", format=3)
Or copy the following string to a file:
! cat Timb.tre
(s1:6,(s3:5.90476,(s5:5.80952,(s7:5.71429,(s9:5.61905,(s11:5.52381,(s13:5.42857,(s15:5.33333,(s17:5.2381,(s19:5.14286,(s21:5.04762,(s23:4.95238,(s25:4.85714,(s27:4.7619,(s29:4.66667,(s31:4.57143,(s33:4.47619,(s35:4.38095,(s37:4.28571,(s39:4.19048,(s41:4.09524,(s43:4,(s45:3.90476,(s47:3.80952,(s49:3.71429,(s51:3.61905,(s53:3.52381,(s55:3.42857,(s57:3.33333,(s59:3.2381,(s61:3.14286,(s63:3.04762,(s65:2.95238,(s67:2.85714,(s69:2.7619,(s71:2.66667,(s73:2.57143,(s75:2.47619,(s77:2.38095,(s79:2.28571,(s81:2.19048,(s83:2.09524,(s85:2,(s87:1.90476,(s89:1.80952,(s91:1.71429,(s93:1.61905,(s95:1.52381,(s97:1.42857,(s99:1.33333,(s101:1.2381,(s103:1.14286,(s105:1.04762,(s107:0.952381,(s109:0.857143,(s111:0.761905,(s113:0.666667,(s115:0.571429,(s117:0.47619,(s119:0.380952,(s121:0.285714,(s123:0.190476,(s125:0.0952381,s126:0.0952381)s124:0.0952381)s122:0.0952381)s120:0.0952381)s118:0.0952381)s116:0.0952381)s114:0.0952381)s112:0.0952381)s110:0.0952381)s108:0.0952381)s106:0.0952381)s104:0.0952381)s102:0.0952381)s100:0.0952381)s98:0.0952381)s96:0.0952381)s94:0.0952381)s92:0.0952381)s90:0.0952381)s88:0.0952381)s86:0.0952381)s84:0.0952381)s82:0.0952381)s80:0.0952381)s78:0.0952381)s76:0.0952381)s74:0.0952381)s72:0.0952381)s70:0.0952381)s68:0.0952381)s66:0.0952381)s64:0.0952381)s62:0.0952381)s60:0.0952381)s58:0.0952381)s56:0.0952381)s54:0.0952381)s52:0.0952381)s50:0.0952381)s48:0.0952381)s46:0.0952381)s44:0.0952381)s42:0.0952381)s40:0.0952381)s38:0.0952381)s36:0.0952381)s34:0.0952381)s32:0.0952381)s30:0.0952381)s28:0.0952381)s26:0.0952381)s24:0.0952381)s22:0.0952381)s20:0.0952381)s18:0.0952381)s16:0.0952381)s14:0.0952381)s12:0.0952381)s10:0.0952381)s8:0.0952381)s6:0.0952381)s4:0.0952381)s2:0.0952381);
## show tree, remove node circles
#for node in Timb.traverse():
# node.img_style["size"] = 0
#Timb.render("%%inline", h=500)
Check that the trees are the same length (close enough).
print set([i.get_distance(Tbal) for i in Tbal]), 'treelength'
print len(Tbal), 'tips'
print set([i.get_distance(Timb) for i in Timb]), 'treelength'
print len(Timb), 'tips'
set([6.0]) treelength 64 tips set([5.999999999999996]) treelength 64 tips
This tree is inferred in notebook 3, and here it is scaled with penalized likelihood to be ultrametric.
%load_ext rpy2.ipython
%%R -w 400 -h 600
library(ape)
## make tree ultrametric using penalized likelihood
Vtree <- read.tree("~/Dropbox/RAxML_bestTree.VIB_small_c85d6m4p99")
Utree <- drop.tip(Vtree, "clemensiae_DRY6_PWS_2135")
Utree <- ladderize(chronopl(Utree, 0.5))
## multiply bls so tree length=6 after dropping outgroup
Utree$edge.length <- Utree$edge.length*6
## save the new tree
write.tree(Utree, "Tvib.tre")
plot(Utree, cex=0.7, edge.width=2)
add.scale.bar()
#edgelabels(round(Utree$edge.length,3))
#### load TVib tree into Python and print newick string
Tvib = ete2.Tree("Tvib.tre")
! cat Tvib.tre
(((((((((((((anamensis_C6_PWS_2094:0.7975737351,sempervirens_combined:0.7975737351):0.38788117,setigerum:1.185454905):0.1159036148,((tashiori_D30_TET_YAH:0.9508943617,luzonicum_D27_9M_2005:0.9508943617):0.2224344758,erosum_D23_MJDJP_4:1.173328837):0.1280296823):0.1966874261,((parvifolium_D29_KFC_1953:0.9280851365,dilitatum_ELS45:0.9280851365):0.3392240969,wrightii_D31_MJDJP_1:1.267309233):0.2307367125):0.1683708168,japonicum_D26_WC_273:1.666416763):0.2285748893,integrifolium_D25_KFC_1946:1.894991652):0.8552902281,((betulifolium:1.406180911,muhalla_D28_WC_274:1.406180911):0.2814444291,foetidum_D24_KFC_1942:1.68762534):1.06265654):0.4191560518,(acerfolium_ELS88:2.103542008,orientale_DRY2_MJD_GEORGIA:2.103542008):1.065895924):0.2536372853,((((glaberrimum_D34_PWS_2323:1.145733296,vernicosum_D21_PWS_2123:1.145733296):0.4959207291,beccarii_combinedX:1.641654025):0.6376473347,sambucina_D20_PWS_2100:2.27930136):0.3829119791,(cylindricum_DRY1_WC_268:0.8292472406,coriaceum_combined:0.8292472406):1.832966098):0.7608618785):0.8509283234,(sargentii_RCW19:1.145837138,opulus_D6_WC_250:1.145837138):3.128166403):0.1755491402,(((acutifolium_DRY3_MEX_006:1.055259441,sulcatum_D9_MEX_003:1.055259441):0.3022564037,(triphyllum_D13_PWS_1783:0.6956223643,jamesonii_D12_PWS_1636:0.6956223643):0.6618934804):0.7545581918,(dentatum_ELS4:1.379325329,recognitum_AA_1471_83B:1.379325329):0.7327487076):2.337478644):0.1834757395,(((cinnamomifolium_PWS2105X:0.9443909619,propinquum_DRY4_WC_276:0.9443909619):0.1859467029,davidii_D32_WC_269:1.130337665):1.715940815,tinus_D33_WC_277:2.84627848):1.78674994):1.36697158,(((((((prunifolium_ELS57:0.4605954259,prunifolium_AA_22586A:0.4605954259):0.4436690703,rufidulum_ELS25:0.9042644962):0.525530087,lentago_ELS85:1.429794583):2.029772371,(cassinoides_ELS2:3.287159629,punctatum_D19_PWS_2097:3.287159629):0.1724073248):0.6538743879,(((veitchii:0.6523403244,rhytidophyllum:0.6523403244):1.024743851,lantan_combined:1.677084175):0.4005599057,((carlesii_D1_BP_001:0.4336007225,bitchuense_combined:0.4336007225):1.376780113,macrocephalum_D2_WC_284:1.810380836):0.2672632448):2.035797261):1.439283941,((((((((foetens_ERAD10:0.8212355504,grandiflorum_ERAD11_Wendy:0.8212355504):0.3328646861,farreri_RCW21:1.154100236):0.4228107657,suspensum_C5_MJD_111711:1.576911002):0.8113023509,erubescens_RCW36:2.388213353):0.3038880768,henryi_D22_WC_272:2.69210143):0.4106409896,(sieboldii_AA_616_6B:2.627864544,odoratissimum_combined:2.627864544):0.4748778756):0.7034334346,((plicatum_C1_MJDJP_12:1.677317667,hanceanum_D4_PWS_2195:1.677317667):0.4205302257,lutescens_D35_PWS_2077:2.097847893):1.708327961):0.9654371248,amplificatum_D3_SAN_156003:4.771612979):0.7811123034):0.1965029727,((((furcatum_combined:1.183437734,sympodiale_D18_KFC_1932:1.183437734):0.554269789,lantanoides_D15_Beartown_2:1.737707523):0.3948362082,nervosum_C4_PWS_2298:2.132543731):3.301651155,(taiwanianum_TW1_KFC_1952:1.319017626,urceolatum_MJD_Japan_8:1.319017626):4.11517726):0.3150333693):0.2507717451);
Here I use the simrrls program to simulate RADseq data on each input topology with locus dropout occurring with respect to phylogenetic distances. Find simrrls in my github profile.
%%bash
## balanced tree
mkdir -p Tbal_rad_drop/
mkdir -p Tbal_ddrad_drop/
mkdir -p Tbal_rad_covfull/
mkdir -p Tbal_rad_covlow/
mkdir -p Tbal_rad_covmed/
## imbalanced tree
mkdir -p Timb_rad_drop/
mkdir -p Timb_ddrad_drop/
mkdir -p Timb_rad_covfull/
mkdir -p Timb_rad_covlow/
mkdir -p Timb_rad_covmed/
## sims on empirical Viburnum topo
mkdir -p Tvib_rad_drop/
mkdir -p Tvib_ddrad_drop/
mkdir -p Tvib_rad_covfull/
mkdir -p Tvib_rad_covlow/
mkdir -p Tvib_rad_covmed
%%bash
simrrls -h
usage: simrrls [-h] [--version] [-o outname] [-mc dropout] [-ms dropout] [-e error] [-f datatype] [-I indels] [-l length] [-L nLoci] [-n Ninds] [-N Ne] [-t tree] [-u mu] [-df depthfunc] [-dm depthmean] [-ds depthstd] [-c1 cut_1] [-c2 cut_2] [-i1 min_insert] [-i2 max_insert] [-r1 seed_1] [-r2 seed_2] optional arguments: -h, --help show this help message and exit --version show program's version number and exit -o outname [str] output file name prefix (default 'out') -mc dropout [0/1] allelic dropout from mutation to cut sites (default 0) -ms dropout [0/1] allelic dropout from new cut sites in seq (default 0) -e error [float] sequencing error rate (default 0.0005) -f datatype [str] datatype (default rad) (options: rad, gbs, ddrad, pairddrad, pairgbs) -I indels [float] rate of indel mutations (default 0) ex: 0.001 -l length [int] length of simulated sequences (default 100) -L nLoci [int] number of loci to simulate (default 100) -n Ninds [int] N individuals from each taxon (default 1) -N Ne [int] pop size (Ne for all lineages; default 5e5) -t tree [str] file name or newick string of ultrametric tree (default 12 taxon balanced tree w/ bls=1) -u mu [float] per site mutation rate (default 1e-9) -df depthfunc [str] model for sampling copies (default norm, other=exp) -dm depthmean [int] mean sampled copies in norm, 1/m for exp (default 10) -ds depthstd [int] stdev sampled copies, used with norm model (default 0) -c1 cut_1 [str] restriction site 1 (default CTGCAG) -c2 cut_2 [str] restriction site 1 (default CCGG) -i1 min_insert [int] total frag len = (2*l)+insert (default 200) -i2 max_insert [int] total frag len = (2*l)+insert (default 400) -r1 seed_1 [int] random seed 1 (default 1234567) -r2 seed_2 [int] random seed 2 (default 7654321) Example usage: simrrls -o test1 -N 1e6 -n 4 -L 1000 simrrls -o test2 -f ddrad -c1 CCTGCAGG -c2 CCGG simrrls -o test3 -f pairgbs -c1 CTGCAG -D 1 -i1 100 -i2 400 echo "((a:1,b:1):1,c:2);" > treefile simrrls -o test4 -t treefile -df norm -dm 5 -ds 1
Here I simulate 1000 loci on each tree. For each tree data are simulated in 5 ways. With and without data loss from mutation-disruption or low sequencing coverage, and as a rad data type (one cutter) and ddrad (two cutters). This will take about 10 minutes to run probably.
%%bash
for tree in Tbal Timb Tvib;
do
simrrls -mc 1 -ms 1 -t $tree.tre \
-L 1000 -l 100 \
-u 1e-9 -N 1e6 \
-f rad -c1 CTGCAG \
-o $tree\_rad_drop/$tree
simrrls -mc 1 -ms 1 -t $tree.tre \
-L 1000 -l 100 \
-u 1e-9 -N 1e6 \
-f ddrad -c1 CTGCAG -c2 AATT \
-o $tree\_ddrad_drop/$tree
simrrls -mc 0 -ms 0 -t $tree.tre \
-L 1000 -l 100 \
-u 1e-9 -N 1e6 \
-f rad -c1 CTGCAG \
-o $tree\_rad_covfull/$tree
simrrls -mc 0 -ms 0 -t $tree.tre \
-L 1000 -l 100 \
-u 1e-9 -N 1e6 \
-f rad -c1 CTGCAG \
-dm 5 -ds 5 \
-o $tree\_rad_covmed/$tree
simrrls -mc 0 -ms 0 -t $tree.tre \
-L 1000 -l 100 \
-u 1e-9 -N 1e6 \
-f rad -c1 CTGCAG \
-dm 2 -ds 5 \
-o $tree\_rad_covlow/$tree
done
%%bash
pyrad --version
pyRAD 3.0.66
%%bash
## new params file (remove existing file if present)
rm params.txt
pyrad -n
new params.txt file created
%%bash
## enter parameters into params file using sed
sed -i '/## 1. /c\Tbal_rad_drop ## 1. working dir ' params.txt
sed -i '/## 2. /c\Tbal_rad_drop/*.gz ## 2. data loc ' params.txt
sed -i '/## 3. /c\Tbal_rad_drop/*barcodes.txt ## 3. Bcode ' params.txt
sed -i '/## 6. /c\TGCAG,AATT ## 6. cutters ' params.txt
sed -i '/## 7. /c\20 ## 7. Nproc ' params.txt
sed -i '/## 10. /c\.82 ## 10. clust thresh' params.txt
sed -i '/## 11. /c\rad ## 11. datatype ' params.txt
sed -i '/## 12. /c\2 ## 12. minCov ' params.txt
sed -i '/## 13. /c\10 ## 13. maxSH' params.txt
sed -i '/## 14. /c\Tbal ## 14. outname' params.txt
sed -i '/## 24./c\99 ## 24. maxH' params.txt
sed -i '/## 30./c\n,p,s ## 30. out format' params.txt
## IPython code to iterate over trees and coverages and run pyrad
## sometimes this freezes when run in a jupyter notebook due
## to problems with multiprocessing in notebooks. This is why my new
## work with ipyrad uses ipyparallel instead of multiprocessing.
for tree in ['Tbal', 'Timb', 'Tvib']:
for dtype in ['rad', 'ddrad']:
with open('params.txt', 'rb') as params:
pp = params.readlines()
pp[1] = "{}_{}_drop ## 1. \n".format(tree, dtype)
pp[2] = "{}_{}_drop/*.gz ## 2. \n".format(tree, dtype)
pp[3] = "{}_{}_drop/*barcodes.txt ## 3. \n".format(tree, dtype)
pp[14] = "{} ## 14. \n".format(tree)
with open('params.txt', 'wb') as params:
params.write("".join(pp))
## this calls pyrad as a bash script
! pyrad -p params.txt >> log.txt 2>&1
for cov in ['full', 'med', 'low']:
with open('params.txt', 'rb') as params:
pp = params.readlines()
pp[1] = "{}_rad_cov{} ## 1. \n".format(tree, cov)
pp[2] = "{}_rad_cov{}/*.gz ## 2. \n".format(tree, cov)
pp[3] = "{}_rad_cov{}/*barcodes.txt ## 3. \n".format(tree, cov)
pp[14] = "{} ## 14. \n".format(tree)
with open('params.txt', 'wb') as params:
params.write("".join(pp))
## this calls pyrad as a bash script
! pyrad -p params.txt >> log.txt 2>&1
def getarray(locifile, tree):
""" parse the loci list and return a
presence/absence matrix ordered by
the tips on the tree"""
## parse the loci file
loci = open(locifile).read().split("\n//")[:-1]
## order (ladderize) the tree
tree.ladderize()
## get tip names
names = tree.get_leaf_names()
## make empty matrix
lxs = np.zeros((len(names), len(loci)))
## fill the matrix
for loc in xrange(len(loci)):
for seq in loci[loc].split("\n"):
if ">" in seq:
lxs[names.index(seq.split()[0][1:].rsplit("_", 1)[0]),loc] += 1
return lxs
def countmatrix(lxsabove, lxsbelow, max=0):
""" fill a matrix with pairwise data sharing
between each pair of samples. You could put
in two different 'share' matrices to have
different results above and below the diagonal.
Can enter a max value to limit fill along diagonal.
"""
share = np.zeros((lxsabove.shape[0],
lxsbelow.shape[0]))
## fill above
names = range(lxsabove.shape[0])
for row in lxsabove:
for samp1,samp2 in itertools.combinations(names,2):
shared = lxsabove[samp1, lxsabove[samp2,]>0].sum()
share[samp1,samp2] = shared
## fill below
for row in lxsbelow:
for samp2,samp1 in itertools.combinations(names,2):
shared = lxsabove[samp1, lxsabove[samp2,]>0].sum()
share[samp1,samp2] = shared
## fill diagonal
if not max:
for row in range(len(names)):
share[row,row] = lxsabove[row,].sum()
else:
for row in range(len(names)):
share[row,row] = max
return share
def plotSVGmatrix(share, outname):
surf = plt.pcolormesh(share, cmap="gist_yarg")
dims = plt.axis('image')
surf.axes.get_xaxis().set_ticklabels([])
surf.axes.get_xaxis().set_ticks([])
surf.axes.get_yaxis().set_ticklabels([])
surf.axes.get_yaxis().set_ticks([])
ax = plt.gca()
ax.invert_yaxis()
plt.colorbar(surf, aspect=15)
if outname:
plt.savefig(outname+".svg")
def fullplot(locifile, tree, outname=None):
lxsB = getarray(locifile, tree)
share = countmatrix(lxsB, lxsB)
plotSVGmatrix(share, outname)
fullplot('Tbal_rad_drop/outfiles/Tbal.loci', Tbal, 'Tbal_drop')
fullplot('Tbal_rad_covlow/outfiles/Tbal.loci', Tbal, 'Tbal_covlow')
fullplot('Tbal_rad_covmed/outfiles/Tbal.loci', Tbal, 'Tbal_covmed')
fullplot('Tbal_rad_covfull/outfiles/Tbal.loci', Tbal, 'Tbal_covfull')
fullplot('Timb_rad_drop/outfiles/Timb.loci', Timb, 'Timb_drop')
fullplot('Tvib_rad_drop/outfiles/Tvib.loci', Tvib, 'Tvib_drop')
fullplot('Tvib_rad_covlow/outfiles/Tvib.loci', Tvib, 'Tvib_covlow')
First we re-calculate the pair-wise data sharing matrices for all species in each data set.
lxs_Tbal_droprad = getarray("Tbal_rad_drop/outfiles/Tbal.loci", Tbal)
lxs_Tbal_dropddrad = getarray("Tbal_ddrad_drop/outfiles/Tbal.loci", Tbal)
lxs_Tbal_covlow = getarray("Tbal_rad_covlow/outfiles/Tbal.loci", Tbal)
lxs_Tbal_covmed = getarray("Tbal_rad_covmed/outfiles/Tbal.loci", Tbal)
lxs_Tbal_covfull = getarray("Tbal_rad_covfull/outfiles/Tbal.loci", Tbal)
lxs_Timb_droprad = getarray("Timb_rad_drop/outfiles/Timb.loci", Timb)
lxs_Timb_dropddrad = getarray("Timb_ddrad_drop/outfiles/Timb.loci", Timb)
lxs_Timb_covlow = getarray("Timb_rad_covlow/outfiles/Timb.loci", Timb)
lxs_Timb_covmed = getarray("Timb_rad_covmed/outfiles/Timb.loci", Timb)
lxs_Timb_covfull = getarray("Timb_rad_covfull/outfiles/Timb.loci", Timb)
lxs_Tvib_droprad = getarray("Tvib_rad_drop/outfiles/Tvib.loci", Tvib)
lxs_Tvib_dropddrad = getarray("Tvib_ddrad_drop/outfiles/Tvib.loci", Tvib)
lxs_Tvib_covlow = getarray("Tvib_rad_covlow/outfiles/Tvib.loci", Tvib)
lxs_Tvib_covmed = getarray("Tvib_rad_covmed/outfiles/Tvib.loci", Tvib)
lxs_Tvib_covfull = getarray("Tvib_rad_covfull/outfiles/Tvib.loci", Tvib)
def count_inf4(tree, matrix, node):
""" count the number of loci with data spanning
a given node in the tree """
## get children of selected node
a, b = node.get_children()
## get tip descendents of a and b
tips_a = set(a.get_leaf_names())
tips_b = set(b.get_leaf_names())
## get every other tip (outgroups)
upone = node.up
if upone.is_root():
ch = upone.children
sis = [i for i in ch if i != node][0]
if sis.children:
tips_c = sis.children[0].get_leaf_names()
tips_d = sis.children[1].get_leaf_names()
else:
return 0
else:
upone = set(node.up.get_leaf_names())
tips_c = upone - tips_a - tips_b
tips_all = set(tree.get_leaf_names())
tips_d = tips_all - tips_a - tips_b - tips_c
## get indices in matrix for leaf tips
names = tree.get_leaf_names()
index_a = [names.index(i) for i in tips_a]
index_b = [names.index(i) for i in tips_b]
index_c = [names.index(i) for i in tips_c]
index_d = [names.index(i) for i in tips_d]
## how man loci are "informative"
inf = 0
for col in matrix.T:
hits_a = sum([col[i] for i in index_a])
hits_b = sum([col[i] for i in index_b])
hits_c = sum([col[i] for i in index_c])
hits_d = sum([col[i] for i in index_d])
if all([hits_a, hits_b, hits_c, hits_d]):
inf += 1
return inf
Here I iterate over each node and apply count_inf4 which returns the number of loci that are informative for the subtending bipartition, and count_snps which counts snps segregating at that bipartition. This takes a few minutes to run.
def nodes_dat(tree, lxs, datfilename):
dat = []
for node in tree.traverse():
if not (node.is_leaf() or node.is_root()):
loci = count_inf4(tree, lxs, node)
dist = round(tree.get_distance(node),2)
dat.append([dist, loci])
node.name = "%d" % loci
## print tree with bls & node labels
tree.write(format=3,outfile=datfilename+".tre")
## print data to file
with open(datfilename, 'w') as outfile:
np.savetxt(outfile, np.array(dat), fmt="%.2f")
%%bash
## a new directory to store the data in
mkdir -p analysis_counts2
nodes_dat(Tbal, lxs_Tbal_droprad,
"analysis_counts2/Tbal_droprad.dat3")
nodes_dat(Tbal, lxs_Tbal_dropddrad,
"analysis_counts2/Tbal_dropddrad.dat3")
nodes_dat(Tbal, lxs_Tbal_covlow,
"analysis_counts2/Tbal_covlow.dat3")
nodes_dat(Tbal, lxs_Tbal_covmed,
"analysis_counts2/Tbal_covmed.dat3")
nodes_dat(Tbal, lxs_Tbal_covfull,
"analysis_counts2/Tbal_covfull.dat3")
nodes_dat(Timb, lxs_Timb_droprad,
"analysis_counts2/Timb_droprad.dat3")
nodes_dat(Timb, lxs_Timb_dropddrad,
"analysis_counts2/Timb_dropddrad.dat3")
nodes_dat(Timb, lxs_Timb_covlow,
"analysis_counts2/Timb_covlow.dat3")
nodes_dat(Timb, lxs_Timb_covmed,
"analysis_counts2/Timb_covmed.dat3")
nodes_dat(Timb, lxs_Timb_covfull,
"analysis_counts2/Timb_covfull.dat3")
nodes_dat(Tvib, lxs_Tvib_droprad,
"analysis_counts2/Tvib_droprad.dat3")
nodes_dat(Tvib, lxs_Tvib_dropddrad,
"analysis_counts2/Tvib_dropddrad.dat3")
nodes_dat(Tvib, lxs_Tvib_covlow,
"analysis_counts2/Tvib_covlow.dat3")
nodes_dat(Tvib, lxs_Tvib_covmed,
"analysis_counts2/Tvib_covmed.dat3")
nodes_dat(Tvib, lxs_Tvib_covfull,
"analysis_counts2/Tvib_covfull.dat3")
%load_ext rpy2.ipython
%%R
library(ape)
%%R
## read in the data and factor results
dat <- read.table("analysis_counts2/Tbal_droprad.dat3",
header=F, col.names=c('depth','loci'))
dat[,1] <- as.factor(dat[,1])
Tbal_droprad_Lme <- with(dat, tapply(loci, depth, mean))
dat <- read.table("analysis_counts2/Tbal_dropddrad.dat3",
header=F, col.names=c('depth','loci'))
dat[,1] <- as.factor(dat[,1])
Tbal_dropddrad_Lme <- with(dat, tapply(loci, depth, mean))
dat <- read.table("analysis_counts2/Tbal_covlow.dat3",
header=F, col.names=c('depth','loci'))
dat[,1] <- as.factor(dat[,1])
Tbal_covlow_Lme <- with(dat, tapply(loci, depth, mean))
dat <- read.table("analysis_counts2/Tbal_covmed.dat3",
header=F, col.names=c('depth','loci'))
dat[,1] <- as.factor(dat[,1])
Tbal_covmed_Lme <- with(dat, tapply(loci, depth, mean))
dat <- read.table("analysis_counts2/Tbal_covfull.dat3",
header=F, col.names=c('depth','loci'))
dat[,1] <- as.factor(dat[,1])
Tbal_covfull_Lme <- with(dat, tapply(loci, depth, mean))
%%R
## read in the data and factor results
dat <- read.table("analysis_counts2/Timb_droprad.dat3",
header=F, col.names=c('depth','loci'))
dat[,1] <- as.factor(dat[,1])
Timb_droprad_Lme <- with(dat, tapply(loci, depth, mean))
dat <- read.table("analysis_counts2/Timb_dropddrad.dat3",
header=F, col.names=c('depth','loci'))
dat[,1] <- as.factor(dat[,1])
Timb_dropddrad_Lme <- with(dat, tapply(loci, depth, mean))
dat <- read.table("analysis_counts2/Timb_covlow.dat3",
header=F, col.names=c('depth','loci'))
dat[,1] <- as.factor(dat[,1])
Timb_covlow_Lme <- with(dat, tapply(loci, depth, mean))
dat <- read.table("analysis_counts2/Timb_covmed.dat3",
header=F, col.names=c('depth','loci'))
dat[,1] <- as.factor(dat[,1])
Timb_covmed_Lme <- with(dat, tapply(loci, depth, mean))
dat <- read.table("analysis_counts2/Timb_covfull.dat3",
header=F, col.names=c('depth','loci'))
dat[,1] <- as.factor(dat[,1])
Timb_covfull_Lme <- with(dat, tapply(loci, depth, mean))
%%R
## read in the data and factor results
dat <- read.table("analysis_counts2/Tvib_droprad.dat3",
header=F, col.names=c('depth','loci'))
dat[,1] <- as.factor(dat[,1])
Tvib_droprad_Lme <- with(dat, tapply(loci, depth, mean))
dat <- read.table("analysis_counts2/Tvib_dropddrad.dat3",
header=F, col.names=c('depth','loci'))
dat[,1] <- as.factor(dat[,1])
Tvib_dropddrad_Lme <- with(dat, tapply(loci, depth, mean))
dat <- read.table("analysis_counts2/Tvib_covlow.dat3",
header=F, col.names=c('depth','loci'))
dat[,1] <- as.factor(dat[,1])
Tvib_covlow_Lme <- with(dat, tapply(loci, depth, mean))
dat <- read.table("analysis_counts2/Tvib_covmed.dat3",
header=F, col.names=c('depth','loci'))
dat[,1] <- as.factor(dat[,1])
Tvib_covmed_Lme <- with(dat, tapply(loci, depth, mean))
dat <- read.table("analysis_counts2/Tvib_covfull.dat3",
header=F, col.names=c('depth','loci'))
dat[,1] <- as.factor(dat[,1])
Tvib_covfull_Lme <- with(dat, tapply(loci, depth, mean))
%%R -w 400 -h 400
#svg("box1.svg", width=4, height=5)
L = Tbal_droprad_Lme
plot(L, xlim=c(0,6), ylim=c(575,1025),
cex.axis=1.25, type='n', xaxt="n")
#abline(h=1000, lwd=2, col="#6E6E6E", lty="dotted")
L = Tbal_covfull_Lme
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
lines(df1, type='l', lwd=2, lty=2)
points(df1, cex=2.5, pch=21, col="#262626", bg="#262626", lwd=0.75)
L = Tbal_droprad_Lme
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
lines(df1, type='l', lwd=2, lty=2)
points(df1, cex=2.5, pch=21, col="#6E6E6E", bg="#6E6E6E", lwd=0.75)
#L = Tbal_dropddrad_Lme
#df1 = data.frame(as.numeric(names(L)),as.numeric(L))
#points(df1, cex=3.5, pch=21, col="#6E6E6E", bg="#D3D3D3", lwd=0.75)
box()
axis(side=1, at=seq(0,6,0.5),
labels=as.character(seq(6,0,by=-0.5)), cex.axis=1.25)
#dev.off()
%%R -w 400 -h 400
#svg("box2.svg", width=4, height=5)
L = Tbal_covlow_Lme
plot(L, xlim=c(0,6), ylim=c(575,1025),
cex.axis=1.25, type='n', xaxt="n")
abline(h=1000, lwd=2, col="#6E6E6E", lty="dotted")
L = Tbal_covfull_Lme
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
lines(df1, type='l', lwd=2, lty=2)
points(df1, cex=2.5, pch=21, col="#262626", bg="#262626", lwd=0.75)
L = Tbal_covmed_Lme
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
lines(df1, type='l', lwd=2, lty=2)
points(df1, cex=2.5, pch=21, col="#262626", bg="#6E6E6E", lwd=0.75)
#L = Tbal_covlow_Lme
#df1 = data.frame(as.numeric(names(L)),as.numeric(L))
#points(df1, cex=3.5, pch=21, col="#262626", bg="#D3D3D3", lwd=0.75)
box()
axis(side=1, at=seq(0,6,0.5),
labels=as.character(seq(6,0,by=-0.5)), cex.axis=1.25)
#dev.off()
%%R -w 400 -h 400
#svg("box3.svg", width=4, height=5)
## samples every 6th to make plot more readable
L = Timb_droprad_Lme[c(3:62)]
plot(L, xlim=c(0,6), ylim=c(575, 1025),
cex.axis=1.25, type='n', xaxt="n")#, yaxt="n")
#abline(h=1000, lwd=2, col="#6E6E6E", lty="dotted")
L = Timb_covfull_Lme[seq(3, 65, 6)]
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
lines(df1, type='l', lwd=2, lty=2)
points(df1, cex=2.5, pch=21, col="#262626", bg="#262626", lwd=0.75)
L = Timb_droprad_Lme[seq(3, 65, 6)]
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
lines(df1, type='l', lwd=2, lty=2)
points(df1, cex=2.5, pch=21, col="#262626", bg="#6E6E6E", lwd=0.75)
box()
axis(side=1, at=seq(0,6, 0.5),
labels=as.character(seq(6,0,by=-0.5)), cex.axis=1.25)
#dev.off()
%%R -w 400 -h 400
## samples every 6th to make plot more readable
#svg("box4.svg", width=4, height=5)
L = Timb_droprad_Lme[c(3:62)]
plot(L, xlim=c(0,6), ylim=c(575, 1025),
cex.axis=1.25, type='n', xaxt="n")#, yaxt="n")
#abline(h=1000, lwd=2, col="#6E6E6E", lty="dotted")
L = Timb_covfull_Lme[seq(3, 65, 6)]
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
lines(df1, type='l', lwd=2, lty=2)
points(df1, cex=2.5, pch=21, col="#262626", bg="#262626", lwd=0.75)
L = Timb_covmed_Lme[seq(3, 65, 6)]
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
lines(df1, type='l', lwd=2, lty=2)
points(df1, cex=2.5, pch=21, col="#262626", bg="#6E6E6E", lwd=0.75)
box()
axis(side=1, at=seq(0,6, 0.5),
labels=as.character(seq(6,0,by=-0.5)), cex.axis=1.25)
#dev.off()
%%R -w 400 -h 500
#svg("Sourcemissing.svg", height=10, width=8)
#svg("Sourcemissing.svg", height=7, width=5.33)
mat2 = matrix(c(1,1,1,4,4,4,7,7,7,
1,1,1,4,4,4,7,7,7,
2,2,2,5,5,5,8,8,8,
3,3,3,6,6,6,9,9,9),
4,9, byrow=TRUE)
layout(mat2)
par(mar=c(1,1,0,1),
oma=c(2,2,1,0))
#########################################################
tre <- read.tree("Tbal.tre")
plot(tre, show.tip.label=F,
edge.width=2.5, type='p',
x.lim=c(0,6))
####
L = Tbal_droprad_Lme
plot(L, xlim=c(0,6), ylim=c(-25,1200),
cex.axis=1.25, type='n', xaxt="n")
abline(h=1000, lwd=2, col="#6E6E6E", lty="dotted")
L = Tbal_covfull_Lme
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
points(df1, cex=1.5, pch=21, col="#262626", bg="#262626", lwd=0.75)
L = Tbal_droprad_Lme
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
points(df1, cex=1.5, pch=21, col="#262626", bg="#6E6E6E", lwd=0.75)
L = Tbal_dropddrad_Lme
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
points(df1, cex=1.5, pch=21, col="#262626", bg="#D3D3D3", lwd=0.75)
box()
####
L = Tbal_covlow_Lme
plot(L, xlim=c(0,6), ylim=c(-25,1200),
cex.axis=1.25, type='n', xaxt="n")
abline(h=1000, lwd=2, col="#6E6E6E", lty="dotted")
L = Tbal_covfull_Lme
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
points(df1, cex=1.5, pch=21, col="#262626", bg="#262626", lwd=0.75)
L = Tbal_covmed_Lme
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
points(df1, cex=1.5, pch=21, col="#262626", bg="#6E6E6E", lwd=0.75)
L = Tbal_covlow_Lme
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
points(df1, cex=1.5, pch=21, col="#262626", bg="#D3D3D3", lwd=0.75)
axis(side=1, at=seq(0,6,0.5),
labels=as.character(seq(6,0,by=-0.5)), cex.axis=1.25)
box()
##########################################################
tre <- read.tree("Timb.tre")
plot(tre, show.tip.label=F,
edge.width=2.5, type='p',
x.lim=c(0,6))
####
L = Timb_droprad_Lme[c(3:62)]
plot(L, xlim=c(0,6), ylim=c(-25,1200),
cex.axis=1.25, type='n', xaxt="n", yaxt="n")
abline(h=1000, lwd=2, col="#6E6E6E", lty="dotted")
L = Timb_covfull_Lme[c(3:62)]
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
points(df1, cex=1.5, pch=21, col="#262626", bg="#262626", lwd=0.75)
L = Timb_droprad_Lme[c(3:62)]
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
points(df1, cex=1.5, pch=21, col="#262626", bg="#6E6E6E", lwd=0.75)
L = Timb_dropddrad_Lme[c(3:62)]
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
points(df1, cex=1.5, pch=21, col="#262626", bg="#D3D3D3", lwd=0.75)
box()
####
L = Timb_covlow_Lme[c(3:62)]
plot(L, xlim=c(0,6), ylim=c(-25,1200),
cex.axis=1.25, type='n', xaxt="n", yaxt="n")
abline(h=1000, lwd=2, col="#6E6E6E", lty="dotted")
L = Timb_covfull_Lme[c(3:62)]
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
points(df1, cex=1.5, pch=21, col="#262626", bg="#262626", lwd=0.75)
L = Timb_covmed_Lme[c(3:62)]
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
points(df1, cex=1.5, pch=21, col="#262626", bg="#6E6E6E", lwd=0.75)
L = Timb_covlow_Lme[c(3:62)]
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
points(df1, cex=1.5, pch=21, col="#262626", bg="#D3D3D3", lwd=0.75)
box()
##
axis(side=1, at=seq(0,6,0.5),
labels=as.character(seq(6,0,by=-0.5)), cex.axis=1.25)
#########################################################
tre <- read.tree("Tvib.tre")
plot(tre, show.tip.label=F,
edge.width=2.5, type='p',
x.lim=c(0,6))
####
L = Tvib_droprad_Lme
plot(L, xlim=c(0,6.25), ylim=c(-25,1200),
cex.axis=1.25, type='n', xaxt="n", yaxt="n")
abline(h=1000, lwd=2, col="#6E6E6E", lty="dotted")
L = Tvib_covfull_Lme
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
points(df1, cex=1.5, pch=21, col="#262626", bg="#262626", lwd=0.75)
L = Tvib_droprad_Lme
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
points(df1, cex=1.5, pch=21, col="#262626", bg="#6E6E6E", lwd=0.75)
L = Tvib_dropddrad_Lme
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
points(df1, cex=1.5, pch=21, col="#262626", bg="#D3D3D3", lwd=0.75)
box()
####
plot(L, xlim=c(0,6.25), ylim=c(-25,1200),
cex.axis=1.25, type='n', xaxt="n", yaxt="n")
abline(h=1000, lwd=2, col="#6E6E6E", lty="dotted")
L = Tvib_covfull_Lme
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
points(df1, cex=1.5, pch=21, col="#262626", bg="#262626", lwd=0.75)
L = Tvib_covmed_Lme
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
points(df1, cex=1.5, pch=21, col="#262626", bg="#6E6E6E", lwd=0.75)
L = Tvib_covlow_Lme
df1 = data.frame(as.numeric(names(L)),as.numeric(L))
points(df1, cex=1.5, pch=21, col="#262626", bg="#D3D3D3", lwd=0.75)
box()
##
axis(side=1, at=seq(0,6,1),
labels=as.character(seq(6,0,by=-1)), cex.axis=1.25)
#dev.off()
Here I am grabbing the assembled empirical data from notebook_1 (Viburnum) to compare the effect of sequencing coverage with the results we see when simulating data on that tree.
Tvib2 = Tvib.copy()
for node in Tvib2:
node.name = node.name+"_0"
## full size data
lxs_EmpVib_full = getarray("/home/deren/Dropbox/RADexplore/EmpVib/vib_full_64tip_c85d6m4p99.loci", Tvib)#, dropind=1)
lxs_EmpVib_half = getarray("/home/deren/Dropbox/RADexplore/EmpVib/vib_half_64tip_c85d6m4p99.loci", Tvib)#, dropind=1)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-70-59562b793f2b> in <module>() 1 ## full size data 2 #lxs_EmpVib_full = getarray("/home/deren/Dropbox/RADexplore/EmpVib/vib_full_64tip_c85d6m4p99.loci", Tvib)#, dropind=1) ----> 3 lxs_EmpVib_half = getarray("/home/deren/Dropbox/RADexplore/EmpVib/vib_half_64tip_c85d6m4p99.loci", Tvib)#, dropind=1) <ipython-input-30-282cdcd80fbf> in getarray(locifile, tree) 15 for seq in loci[loc].split("\n"): 16 if ">" in seq: ---> 17 lxs[names.index(seq.split()[0][1:].rsplit("_", 1)[0]),loc] += 1 18 return lxs ValueError: 'acutifolium_DRY3_MEX' is not in list
share_full = countmatrix(lxs_EmpVib_full,lxs_EmpVib_full)
plotSVGmatrix(share_full, "EmpVib_full")
share_half = countmatrix(lxs_EmpVib_half,lxs_EmpVib_half)
plotSVGmatrix(share_half, "EmpVib_half")
nodes_dat(Tvib, lxs_EmpVib_half,
"analysis_counts/Tvib_Emp_half.dat3")
nodes_dat(Tvib, lxs_EmpVib_full,
"analysis_counts/Tvib_Emp_full.dat3")
%%R
## read in the data and factor results
dat <- read.table("analysis_counts/Tvib_Emp_full.dat3",
header=F, col.names=c('depth','loci'))
dat[,1] <- as.factor(dat[,1])
EmpVib_full_Lme <- with(dat, tapply(loci, depth, mean))
## read in the data and factor results
dat <- read.table("analysis_counts/Tvib_Emp_half.dat3",
header=F, col.names=c('depth','loci'))
dat[,1] <- as.factor(dat[,1])
EmpVib_half_Lme <- with(dat, tapply(loci, depth, mean))
%%R
EmpVib_half_Lme
%%R -w 200 -h 400
#svg("EmpVib_fullvhalf3.svg", height=6, width=2.5)
mat2 <- matrix(c(1,1,1,2),byrow=TRUE)
layout(mat2)
par(mar=c(1,1,0,1),
oma=c(2,2,1,0))
#########################################################
#tre <- read.tree("Tvib.tre")
#plot(tre, show.tip.label=F,
# edge.width=2.5, type='p',
# x.lim=c(0,2.25))
Vtre <- read.tree("analysis_counts/EmpVib_full.dat3.tre")
plot(Vtre, cex=0.6, adj=0.05, x.lim=c(0,2.25),
edge.width=2.5, type='p',show.tip.label=FALSE)
nodelabels(pch=20, col="black",
cex=as.integer(Vtre$node.label)/7500)
####
L = EmpVib_half_Lme
plot(L, xlim=c(0,2.25), ylim=c(-25,50200),
cex.axis=1.25, type='n', xaxt="n")
df1 = data.frame(as.numeric(names(L)),
as.numeric(L))
points(df1, cex=1.25, pch=21, col="red", bg="red")
####
L = EmpVib_full_Lme
#plot(L, xlim=c(0,2.25), ylim=c(-25,50200),
# cex.axis=1.25, type='n', xaxt="n")
abline(h=0, lwd=2, col="gray", lty="dotted")
df1 = data.frame(as.numeric(names(L)),
as.numeric(L))
points(df1, cex=1.25, pch=21, col="#262626", bg="#262626")
#dev.off()
%%R -w 200 -h 400
#svg("fullvhalf3.svg", height=4.5, width=4)
####
L = EmpVib_full_Lme
plot(L, xlim=c(0,2.25), ylim=c(-25,50200),
cex.axis=1.25, type='n', xaxt="n")
abline(h=0, lwd=2, col="gray", lty="dotted")
df1 = data.frame(as.numeric(names(L)),
as.numeric(L))
points(df1, cex=1.25, pch=21, col="#262626", bg="#262626")
dev.off()
svg("fullonly.svg", height=4.5, width=4)
####
L = EmpVib_half_Lme
plot(L, xlim=c(0,2.25), ylim=c(-25,50200),
cex.axis=1.25, type='n', xaxt="n")
df1 = data.frame(as.numeric(names(L)),
as.numeric(L))
points(df1, cex=1.25, pch=21, col="red", bg="red")
####
L = EmpVib_full_Lme
abline(h=0, lwd=2, col="gray", lty="dotted")
df1 = data.frame(as.numeric(names(L)),
as.numeric(L))
points(df1, cex=1.25, pch=21, col="#262626", bg="#262626")
#dev.off()
%%R
data.frame(cbind(median(EmpVib_full_Lme),
median(EmpVib_half_Lme)),
cbind(mean(EmpVib_full_Lme),
mean(EmpVib_half_Lme)),
col.names=c("full","half"))
%%R
svg('hist.svg', width=4.25, height=4)
hist(rnorm(10000), col="grey")
dev.off()
%%R -w 300 -h 600
Vtre <- read.tree("analysis_counts/EmpVib_full.dat3.tre")
svg("EmpVib_full_nodes.svg", height=6, width=3)
plot(Vtre, cex=0.6, adj=0.05,
edge.width=3, show.tip.label=FALSE)
nodelabels(pch=20, col="black",
cex=as.integer(Vtre$node.label)/10000)
dev.off()
%%R -w 300 -h 600
#svg("locisnpsdepth.svg", height=8, width=4)
#pdf("locisnpsdepth.pdf", height=8, width=4)
mat2 <- matrix(c(1,1,1,5,5,5,
1,1,1,5,5,5,
2,2,2,6,6,6,
3,3,3,7,7,7,
4,4,4,8,8,8),
5,6, byrow=TRUE)
layout(mat2)
par(mar=c(1,1,0,1),
oma=c(2,2,1,0))
tre <- read.tree("Tbal.tre")
plot(tre, show.tip.label=F,
edge.width=2.5, type='p',
x.lim=c(-0.25,2.75))
##-------------------------------
## Plot full data locus sharing
x = seq(1.5,5.5)
y = Tbal_full_Lme#[1:6]
s = Tbal_full_Lsd#[1:6]
plot(x, y, xlim=c(1,7.2), ylim=c(-25,3100),
cex.axis=1.25, type='n', xaxt="n")
abline(h=(seq(0,1000,200)), lwd=1.5, col="gray", lty="dotted")
points(x,y, cex=1.25, pch=21, col="#262626", bg="#262626")
lines(x,y, lwd=2, col="#262626")
segments(x, y-s, x, y+s, col="#262626")
epsilon = 0.02
segments(x-epsilon,y-s,x+epsilon,y-s)
segments(x-epsilon,y+s,x+epsilon,y+s)
x = seq(1.5,5.5)
y = Tbal_full_Sme#[2:6]
s = Tbal_full_Ssd#[2:6]
lines(x, y, lwd=2, col="darkgrey")
points(x, y, cex=1.25, pch=21, col="darkgrey", bg="darkgrey")
segments(x, y-s, x, y+s, col="darkgrey")
epsilon = 0.02
segments(x-epsilon,y-s,x+epsilon,y-s)
segments(x-epsilon,y+s,x+epsilon,y+s)
##--------------------------------
## Plot drop data locus sharing
x = seq(1.5,5.5)
y = Tbal_drop_Lme
s = Tbal_drop_Lsd
plot(x, y, xlim=c(1,7.2), ylim=c(-25,3100),
cex.axis=1.25, type='n', xaxt="n")
abline(h=(seq(0,1000,200)), lwd=1.5, col="gray", lty="dotted")
points(x,y, cex=1.25, pch=21, col="#262626", bg="#262626")
lines(x,y, lwd=2, col="#262626")
segments(x, y-s, x, y+s, col="#262626")
epsilon = 0.02
segments(x-epsilon,y-s,x+epsilon,y-s)
segments(x-epsilon,y+s,x+epsilon,y+s)
x = seq(1.5,5.5)
y = Tbal_drop_Sme
s = Tbal_drop_Ssd
lines(x, y, lwd=2, col="darkgrey")
points(x, y, cex=1.25, pch=21, col="darkgrey", bg="darkgrey")
segments(x, y-s, x, y+s, col="darkgrey")
epsilon = 0.02
segments(x-epsilon,y-s,x+epsilon,y-s)
segments(x-epsilon,y+s,x+epsilon,y+s)
##------------------------------
## Plot cov data locus sharing
x = seq(1.5,5.5)
y = Tbal_cov_Lme
s = Tbal_cov_Lsd
plot(x, y, xlim=c(1,7.2), ylim=c(-25,3100),
cex.axis=1.25, type='n', xaxt="n")
abline(h=(seq(0,1000,200)), lwd=1.5, col="gray", lty="dotted")
points(x,y, cex=1.25, pch=21, col="#262626", bg="#262626")
lines(x,y, lwd=2, col="#262626")
segments(x, y-s, x, y+s, col="#262626")
epsilon = 0.02
segments(x-epsilon,y-s,x+epsilon,y-s)
segments(x-epsilon,y+s,x+epsilon,y+s)
x = seq(1.5,5.5)
y = Tbal_cov_Sme
s = Tbal_cov_Ssd
lines(x, y, lwd=2, col="darkgrey")
points(x, y, cex=1.25, pch=21, col="darkgrey", bg="darkgrey")
segments(x, y-s, x, y+s, col="darkgrey")
epsilon = 0.02
segments(x-epsilon,y-s,x+epsilon,y-s)
segments(x-epsilon,y+s,x+epsilon,y+s)
axis(side=1, at=seq(1.1,8.1,1),
labels=as.character(seq(3,-0.5,by=-0.5)), cex.axis=1.25)
###########################################
###########################################
tre <- read.tree("Timb.tre")
plot(tre, show.tip.label=F,
edge.width=2, type='p')
##------------------------------------
## Plot full data locus sharing
x = seq(2,62)
y = Timb_full_Lme[2:62]
s = Timb_full_Lsd[2:62]
plot(x, y, xlim=c(1,65), ylim=c(-25,3100),
cex.axis=1.25, type='n', yaxt="n", xaxt="n")
abline(h=(seq(0,1000,200)), lwd=1.5, col="gray", lty="dotted")
points(x,y, cex=1, pch=21, col="#262626", bg="#262626")
lines(x,y, lwd=2, col="#262626")
segments(x, y-s, x, y+s, col="#262626")
epsilon = 0.02
segments(x-epsilon,y-s,x+epsilon,y-s)
segments(x-epsilon,y+s,x+epsilon,y+s)
y = Timb_full_Sme[2:62]
s = Timb_full_Ssd[2:62]
lines(x, y, lwd=2, col="darkgrey")
#points(x, y, cex=1, pch=21, col="darkgrey", bg="darkgrey")
segments(x, y-s, x, y+s, col="darkgrey")
epsilon = 0.02
segments(x-epsilon,y-s,x+epsilon,y-s)
segments(x-epsilon,y+s,x+epsilon,y+s)
##----------------------------------
## Plot full data locus sharing
x = seq(2,62)
y = Timb_drop_Lme[2:62]
s = Timb_drop_Lsd[2:62]
plot(x, y, xlim=c(1,65), ylim=c(-25,3100),
cex.axis=1.25, type='n', yaxt="n", xaxt="n")
abline(h=(seq(0,1000,200)), lwd=1.5, col="gray", lty="dotted")
points(x,y, cex=1, pch=21, col="#262626", bg="#262626")
lines(x,y, lwd=2, col="#262626")
segments(x, y-s, x, y+s, col="#262626")
epsilon = 0.02
segments(x-epsilon,y-s,x+epsilon,y-s)
segments(x-epsilon,y+s,x+epsilon,y+s)
y = Timb_drop_Sme[2:62]
s = Timb_drop_Ssd[2:62]
lines(x, y, lwd=2, col="darkgrey")
#points(x, y, cex=1, pch=21, col="darkgrey", bg="darkgrey")
segments(x, y-s, x, y+s)
epsilon = 0.02
segments(x-epsilon,y-s,x+epsilon,y-s)
segments(x-epsilon,y+s,x+epsilon,y+s)
##-----------------------------------
## Plot drop data locus sharing
x = seq(2,62)
y = Timb_cov_Lme[2:62]
s = Timb_cov_Lsd[2:62]
plot(x, y,
xlim=c(1,65), ylim=c(-20,3100),
cex=1, cex.axis=1.25,
pch=21, bg="#262626", xaxt="n", yaxt="n")
abline(h=(seq(0,1000,200)), lwd=1.5, col="gray", lty="dotted")
points(x,y, cex=1, pch=21, col="#262626", bg="#262626")
lines(x,y, lwd=2, col="#262626")
segments(x, y-s, x, y+s, col="#262626")
epsilon = 0.02
segments(x-epsilon,y-s,x+epsilon,y-s)
segments(x-epsilon,y+s,x+epsilon,y+s)
y = Timb_cov_Sme[2:62]
s = Timb_cov_Ssd[2:62]
lines(x, y, lwd=2, col="darkgrey")
#points(x, y, cex=1, pch=21, col="darkgrey", bg="darkgrey")
segments(x, y-s, x, y+s, col="darkgrey")
epsilon = 0.02
segments(x-epsilon,y-s,x+epsilon,y-s)
segments(x-epsilon,y+s,x+epsilon,y+s)
axis(side=1, at=seq(2.1,72,10),
labels=as.character(seq(3,0,by=-0.5)), cex.axis=1.25)
#dev.off()
lxs_EmpVib_full
def write_nodes_to_tree(tree,lxs,treefilename):
for node in tree.traverse():
if not node.is_leaf():
inf = count_inf4(tree, lxs, node)
node.name = "%d" % inf
## print tree with bls & node labels
tree.write(format=3,outfile=treefilename)
write_nodes_to_tree(Tvib, lxs_EmpVib_full, "Tvib_full_nodes.tre")
%%R -w 400 -h 500
tre <- read.tree("loci_Tbal_cov")
plot(tre)#, show.tip.label=F, edge.width=2.5)
#nodelabels(pch=21,
# bg="#262626",
# cex=as.integer(tre$node.label)/150)
nodelabels(tre$node.label, bg='grey', cex=1.5)
How much data are shared by a random N samples, and how much data are shared across the deepest bipartition for 2+N samples. Also how many SNPs?
def counts(lxs, minr, maxr, maxi):
## data store
data = np.zeros((maxr+1-minr,maxi))
for sample in range(minr, maxr+1):
g = itertools.combinations(range(maxr), sample)
i = 0
while i<maxi:
try:
gsamp = g.next()
except StopIteration:
break
shared = sum(lxs[gsamp,:].sum(axis=0) == len(gsamp))
data[sample-minr,i] = shared
i += 1
return data
Dbal = counts(lxs_Tbal_drop, 4, 32, 1000)
Dimb = counts(lxs_Timb_drop, 4, 32, 1000)
Dbal
def counts2(lxs, minr, maxr, maxi):
## data store
data = np.zeros(((maxr+1-minr)*maxi, 3))
count = 0
for sample in range(minr, maxr+1):
g = itertools.combinations(range(maxr), sample)
i = 0
while i<maxi:
try:
gsamp = g.next()
except StopIteration:
break
shared = sum(lxs[gsamp,:].sum(axis=0) == len(gsamp))
datum = [sample, float(shared), i+1]
data[count] = datum
i += 1
count += 1
return data
Dimb1 = counts2(lxs_Timb_full, 4, 32, 100)
Dimb2 = counts2(lxs_Timb_drop, 4, 32, 100)
Dimb3 = counts2(lxs_Timb_cov, 4, 32, 100)
Dbal1 = counts2(lxs_Tbal_full, 4, 32, 100)
Dbal2 = counts2(lxs_Tbal_drop, 4, 32, 100)
Dbal3 = counts2(lxs_Tbal_cov, 4, 32, 100)
def saveto(D, outname):
dd = pd.DataFrame({"time":[i[0] for i in D],
"loci":[i[1] for i in D],
"idx":[i[2] for i in D],
"tree":["Timb" for _ in D]})
dd.to_csv(outname)
saveto(Dimb1, "Dimb1.dat")
saveto(Dimb2, "Dimb2.dat")
saveto(Dimb3, "Dimb3.dat")
saveto(Dbal1, "Dbal1.dat")
saveto(Dbal2, "Dbal2.dat")
saveto(Dbal3, "Dbal3.dat")