#!/usr/bin/env python
# coding: utf-8

# # RADseq data simulations

# I simulated two trees to work with. One that is completely imbalanced (ladder-like) and one that is balanced (equal number tips descended from each node). I'm using the Python package `ete2` for most of the tree manipulations. This notebook was run in Python 2.7. You will also need the package rpy2 installed, as well as a working version of R with the package 'ape' to make tree plots later in the notebook. 

# In[28]:


## standard Python imports
import glob
import itertools
from collections import OrderedDict, Counter

## extra Python imports
import rpy2              ## required for tree plotting
import ete2              ## used for tree manipulation
import egglib            ## used for coalescent simulations
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# In[29]:


## print versions
for pkg in [matplotlib, np, pd, ete2, rpy2]:
    print "{:<10}\t{:<10}".\
    format(pkg.__name__, pkg.__version__)


# ### Simulation software
# I wrote a program to simulate RAD-seq like sequence data which uses the python package egglib for coalescent simulations. Below will check that you have the relevant software installed. See here for simrrls installation: https://github.com/dereneaton/simrrls
# 

# In[20]:


## check simrrls package and requirements
import egglib
import simrrls


# In[21]:


## print versions
print 'egglib', egglib.version
print 'simrrls', simrrls.__version__


# # Generate trees for simulations
# 
# ### Make a balanced tree with 64 tips and tree length = 6

# In[5]:


## base tree
Tbal = ete2.Tree()

## branch lengths
bls = 1.

## namer
n = iter(('s'+str(i) for i in xrange(1,1500)))

## first nodes
n1 = Tbal.add_child(name=n.next(), dist=bls)
n2 = Tbal.add_child(name=n.next(), dist=bls)

## make balanced tree
while len(Tbal.get_leaves()) < 64:
    thisrep = Tbal.get_descendants()
    for node in thisrep:
        if len(node.get_children()) < 1:
            node.add_child(name=n.next(), dist=bls)
            node.add_child(name=n.next(), dist=bls)

## Save newick string to file
Tbal.write(outfile="Tbal.tre", format=3)


# String representation

# In[6]:


## newick string
get_ipython().system(' cat Tbal.tre')


# In[7]:


## show tree, remove node circles
#for node in Tbal.traverse():
#    node.img_style["size"] = 0
#Tbal.render("%%inline", h=500)


# ## Make an imbalanced tree of same treelength with 64 tips

# In[8]:


## base tree
Timb = ete2.Tree()

## namer
n = iter(('s'+str(i) for i in range(1,5000)))

## scale branches to match balanced treelength
brlen = (bls*6.)/63

## first nodes
n1 = Timb.add_child(name=n.next(), dist=brlen)
n2 = Timb.add_child(name=n.next(), dist=brlen)

while len(Timb.get_leaves()) < 64:  
    ## extend others
    for tip in Timb.get_leaves()[:-1]:
        tip.dist += brlen
    ## extend the last node
    Timb.get_leaves()[-1].add_child(name=n.next(), dist=brlen)
    Timb.get_leaves()[-1].add_sister(name=n.next(), dist=brlen)

## write to file
Timb.write(outfile="Timb.tre", format=3)


# Or copy the following string to a file:

# In[9]:


get_ipython().system(' cat Timb.tre')


# In[10]:


## show tree, remove node circles
#for node in Timb.traverse():
#    node.img_style["size"] = 0
#Timb.render("%%inline", h=500)


# Check that the trees are the same length (close enough).

# In[11]:


print set([i.get_distance(Tbal) for i in Tbal]), 'treelength'
print len(Tbal), 'tips'

print set([i.get_distance(Timb) for i in Timb]), 'treelength'
print len(Timb), 'tips'


# ## An ultrametric topology of Viburnum w/ 64 tips
# This tree is inferred in notebook 3, and here it is scaled with penalized likelihood to be ultrametric.

# In[12]:


get_ipython().run_line_magic('load_ext', 'rpy2.ipython')


# In[13]:


get_ipython().run_cell_magic('R', '-w 400 -h 600', 'library(ape)\n\n## make tree ultrametric using penalized likelihood\nVtree <- read.tree("~/Dropbox/RAxML_bestTree.VIB_small_c85d6m4p99")\nUtree <- drop.tip(Vtree, "clemensiae_DRY6_PWS_2135")\nUtree <- ladderize(chronopl(Utree, 0.5))\n\n## multiply bls so tree length=6 after dropping outgroup\nUtree$edge.length <- Utree$edge.length*6\n\n## save the new tree\nwrite.tree(Utree, "Tvib.tre")\nplot(Utree, cex=0.7, edge.width=2)\nadd.scale.bar()\n#edgelabels(round(Utree$edge.length,3))\n')


# In[14]:


#### load TVib tree into Python and print newick string
Tvib = ete2.Tree("Tvib.tre")
get_ipython().system(' cat Tvib.tre')


# # Simulate sequence data on each tree
# Here I use the _simrrls_ program to simulate RADseq data on each input topology with locus dropout occurring with respect to phylogenetic distances. Find simrrls in my github profile.

# ## Comparing tree shapes and sources of missing data

# In[15]:


get_ipython().run_cell_magic('bash', '', '## balanced tree \nmkdir -p Tbal_rad_drop/\nmkdir -p Tbal_ddrad_drop/\nmkdir -p Tbal_rad_covfull/\nmkdir -p Tbal_rad_covlow/\nmkdir -p Tbal_rad_covmed/\n\n## imbalanced tree\nmkdir -p Timb_rad_drop/\nmkdir -p Timb_ddrad_drop/\nmkdir -p Timb_rad_covfull/\nmkdir -p Timb_rad_covlow/\nmkdir -p Timb_rad_covmed/\n\n## sims on empirical Viburnum topo\nmkdir -p Tvib_rad_drop/\nmkdir -p Tvib_ddrad_drop/\nmkdir -p Tvib_rad_covfull/\nmkdir -p Tvib_rad_covlow/\nmkdir -p Tvib_rad_covmed\n')


# #### Show simrrls options

# In[16]:


get_ipython().run_cell_magic('bash', '', 'simrrls -h\n')


# ## Simulate RAD data on different trees and with sampling
# Here I simulate 1000 loci on each tree. For each tree data are simulated in 5 ways. With and without data loss from mutation-disruption or low sequencing coverage, and as a rad data type (one cutter) and ddrad (two cutters). This will take about 10 minutes to run probably.

# In[18]:


get_ipython().run_cell_magic('bash', '', 'for tree in Tbal Timb Tvib;\n  do \n    simrrls -mc 1 -ms 1 -t $tree.tre \\\n            -L 1000 -l 100 \\\n            -u 1e-9 -N 1e6 \\\n            -f rad -c1 CTGCAG \\\n            -o $tree\\_rad_drop/$tree\n            \n    simrrls -mc 1 -ms 1 -t $tree.tre \\\n            -L 1000 -l 100 \\\n            -u 1e-9 -N 1e6 \\\n            -f ddrad -c1 CTGCAG -c2 AATT \\\n            -o $tree\\_ddrad_drop/$tree        \n    \n    simrrls -mc 0 -ms 0 -t $tree.tre \\\n            -L 1000 -l 100 \\\n            -u 1e-9 -N 1e6 \\\n            -f rad -c1 CTGCAG \\\n            -o $tree\\_rad_covfull/$tree\n\n    simrrls -mc 0 -ms 0 -t $tree.tre \\\n            -L 1000 -l 100 \\\n            -u 1e-9 -N 1e6 \\\n            -f rad -c1 CTGCAG \\\n            -dm 5 -ds 5 \\\n            -o $tree\\_rad_covmed/$tree\n            \n    simrrls -mc 0 -ms 0 -t $tree.tre \\\n            -L 1000 -l 100 \\\n            -u 1e-9 -N 1e6 \\\n            -f rad -c1 CTGCAG \\\n            -dm 2 -ds 5 \\\n            -o $tree\\_rad_covlow/$tree\ndone\n')


# # Assemble data sets in _pyRAD_

# In[22]:


get_ipython().run_cell_magic('bash', '', 'pyrad --version\n')


# In[25]:


get_ipython().run_cell_magic('bash', '', '## new params file (remove existing file if present)\nrm params.txt\npyrad -n\n')


# In[26]:


get_ipython().run_cell_magic('bash', '', "## enter parameters into params file using sed\nsed -i '/## 1. /c\\Tbal_rad_drop       ## 1. working dir ' params.txt\nsed -i '/## 2. /c\\Tbal_rad_drop/*.gz  ## 2. data loc ' params.txt\nsed -i '/## 3. /c\\Tbal_rad_drop/*barcodes.txt  ## 3. Bcode ' params.txt\nsed -i '/## 6. /c\\TGCAG,AATT          ## 6. cutters ' params.txt\nsed -i '/## 7. /c\\20                  ## 7. Nproc ' params.txt\nsed -i '/## 10. /c\\.82                ## 10. clust thresh' params.txt\nsed -i '/## 11. /c\\rad                ## 11. datatype ' params.txt\nsed -i '/## 12. /c\\2                  ## 12. minCov ' params.txt\nsed -i '/## 13. /c\\10                  ## 13. maxSH' params.txt\nsed -i '/## 14. /c\\Tbal               ## 14. outname' params.txt\nsed -i '/## 24./c\\99                  ## 24. maxH' params.txt\nsed -i '/## 30./c\\n,p,s               ## 30. out format' params.txt\n")


# In[ ]:


## IPython code to iterate over trees and coverages and run pyrad
## sometimes this freezes when run in a jupyter notebook due 
## to problems with multiprocessing in notebooks. This is why my new
## work with ipyrad uses ipyparallel instead of multiprocessing. 

for tree in ['Tbal', 'Timb', 'Tvib']:
    for dtype in ['rad', 'ddrad']:
        with open('params.txt', 'rb') as params:
            pp = params.readlines()
            pp[1] = "{}_{}_drop                 ## 1. \n".format(tree, dtype)
            pp[2] = "{}_{}_drop/*.gz            ## 2. \n".format(tree, dtype)
            pp[3] = "{}_{}_drop/*barcodes.txt   ## 3. \n".format(tree, dtype)
            pp[14] = "{}                        ## 14. \n".format(tree)
        with open('params.txt', 'wb') as params:
            params.write("".join(pp))
        ## this calls pyrad as a bash script
        get_ipython().system(' pyrad -p params.txt >> log.txt 2>&1')
          
    for cov in ['full', 'med', 'low']:
        with open('params.txt', 'rb') as params:
            pp = params.readlines()
            pp[1] = "{}_rad_cov{}                ## 1. \n".format(tree, cov)
            pp[2] = "{}_rad_cov{}/*.gz           ## 2. \n".format(tree, cov)
            pp[3] = "{}_rad_cov{}/*barcodes.txt  ## 3. \n".format(tree, cov)
            pp[14] = "{}                         ## 14. \n".format(tree)     
        with open('params.txt', 'wb') as params:
            params.write("".join(pp))    
        ## this calls pyrad as a bash script            
        get_ipython().system(' pyrad -p params.txt >> log.txt 2>&1')


# # Visualize data sharing on these trees

# In[30]:


def getarray(locifile, tree):
    """ parse the loci list and return a 
        presence/absence matrix ordered by 
        the tips on the tree"""
    ## parse the loci file
    loci = open(locifile).read().split("\n//")[:-1]
    ## order (ladderize) the tree
    tree.ladderize()
    ## get tip names
    names = tree.get_leaf_names()
    ## make empty matrix
    lxs = np.zeros((len(names), len(loci)))
    ## fill the matrix
    for loc in xrange(len(loci)):
        for seq in loci[loc].split("\n"):
            if ">" in seq:
                lxs[names.index(seq.split()[0][1:].rsplit("_", 1)[0]),loc] += 1
    return lxs


# In[31]:


def countmatrix(lxsabove, lxsbelow, max=0):
    """ fill a matrix with pairwise data sharing
        between each pair of samples. You could put
        in two different 'share' matrices to have
        different results above and below the diagonal.
        Can enter a max value to limit fill along diagonal.
        """
    share = np.zeros((lxsabove.shape[0], 
                      lxsbelow.shape[0]))
    ## fill above
    names = range(lxsabove.shape[0])
    for row in lxsabove:
        for samp1,samp2 in itertools.combinations(names,2):
            shared = lxsabove[samp1, lxsabove[samp2,]>0].sum()
            share[samp1,samp2] = shared
    ## fill below
    for row in lxsbelow:
        for samp2,samp1 in itertools.combinations(names,2):
            shared = lxsabove[samp1, lxsabove[samp2,]>0].sum()
            share[samp1,samp2] = shared
    ## fill diagonal
    if not max:
        for row in range(len(names)):
            share[row,row] = lxsabove[row,].sum()
    else:
        for row in range(len(names)):
            share[row,row] = max
    return share


# In[32]:


def plotSVGmatrix(share, outname):
    surf = plt.pcolormesh(share, cmap="gist_yarg")
    dims = plt.axis('image')
    surf.axes.get_xaxis().set_ticklabels([])
    surf.axes.get_xaxis().set_ticks([])
    surf.axes.get_yaxis().set_ticklabels([])
    surf.axes.get_yaxis().set_ticks([])
    ax = plt.gca()
    ax.invert_yaxis()
    plt.colorbar(surf, aspect=15)
    if outname:
        plt.savefig(outname+".svg")
    
    
def fullplot(locifile, tree, outname=None):
    lxsB = getarray(locifile, tree)
    share = countmatrix(lxsB, lxsB)
    plotSVGmatrix(share, outname)


# In[41]:


fullplot('Tbal_rad_drop/outfiles/Tbal.loci', Tbal, 'Tbal_drop')


# In[42]:


fullplot('Tbal_rad_covlow/outfiles/Tbal.loci', Tbal, 'Tbal_covlow')


# In[43]:


fullplot('Tbal_rad_covmed/outfiles/Tbal.loci', Tbal, 'Tbal_covmed')


# In[44]:


fullplot('Tbal_rad_covfull/outfiles/Tbal.loci', Tbal, 'Tbal_covfull')


# In[45]:


fullplot('Timb_rad_drop/outfiles/Timb.loci', Timb, 'Timb_drop')


# In[46]:


fullplot('Tvib_rad_drop/outfiles/Tvib.loci', Tvib, 'Tvib_drop')


# In[47]:


fullplot('Tvib_rad_covlow/outfiles/Tvib.loci', Tvib, 'Tvib_covlow')


# # The hierarchical distribution of informative sites
# First we re-calculate the pair-wise data sharing matrices for all species in each data set. 

# In[49]:


lxs_Tbal_droprad = getarray("Tbal_rad_drop/outfiles/Tbal.loci", Tbal)
lxs_Tbal_dropddrad = getarray("Tbal_ddrad_drop/outfiles/Tbal.loci", Tbal)
lxs_Tbal_covlow = getarray("Tbal_rad_covlow/outfiles/Tbal.loci", Tbal)
lxs_Tbal_covmed = getarray("Tbal_rad_covmed/outfiles/Tbal.loci", Tbal)
lxs_Tbal_covfull = getarray("Tbal_rad_covfull/outfiles/Tbal.loci", Tbal)

lxs_Timb_droprad = getarray("Timb_rad_drop/outfiles/Timb.loci", Timb)
lxs_Timb_dropddrad = getarray("Timb_ddrad_drop/outfiles/Timb.loci", Timb)
lxs_Timb_covlow = getarray("Timb_rad_covlow/outfiles/Timb.loci", Timb)
lxs_Timb_covmed = getarray("Timb_rad_covmed/outfiles/Timb.loci", Timb)
lxs_Timb_covfull = getarray("Timb_rad_covfull/outfiles/Timb.loci", Timb)

lxs_Tvib_droprad = getarray("Tvib_rad_drop/outfiles/Tvib.loci", Tvib)
lxs_Tvib_dropddrad = getarray("Tvib_ddrad_drop/outfiles/Tvib.loci", Tvib)
lxs_Tvib_covlow = getarray("Tvib_rad_covlow/outfiles/Tvib.loci", Tvib)
lxs_Tvib_covmed = getarray("Tvib_rad_covmed/outfiles/Tvib.loci", Tvib)
lxs_Tvib_covfull = getarray("Tvib_rad_covfull/outfiles/Tvib.loci", Tvib)


# #### A function to count loci for each bipartition (quartet-style)

# In[50]:


def count_inf4(tree, matrix, node):
    """ count the number of loci with data spanning 
    a given node in the tree """
    
    ## get children of selected node
    a, b = node.get_children()
    
    ## get tip descendents of a and b
    tips_a = set(a.get_leaf_names())
    tips_b = set(b.get_leaf_names())
    
    ## get every other tip (outgroups)
    upone = node.up
    if upone.is_root():
        ch = upone.children
        sis = [i for i in ch if i != node][0]
        if sis.children:
            tips_c = sis.children[0].get_leaf_names()
            tips_d = sis.children[1].get_leaf_names()
        else:
            return 0
    else:
        upone = set(node.up.get_leaf_names())
        tips_c = upone - tips_a - tips_b
        tips_all = set(tree.get_leaf_names())
        tips_d = tips_all - tips_a - tips_b - tips_c
    
    ## get indices in matrix for leaf tips
    names = tree.get_leaf_names()
    index_a = [names.index(i) for i in tips_a]
    index_b = [names.index(i) for i in tips_b]
    index_c = [names.index(i) for i in tips_c]
    index_d = [names.index(i) for i in tips_d]
   
    ## how man loci are "informative"
    inf = 0
    for col in matrix.T:
        hits_a = sum([col[i] for i in index_a])
        hits_b = sum([col[i] for i in index_b])
        hits_c = sum([col[i] for i in index_c])
        hits_d = sum([col[i] for i in index_d])

        if all([hits_a, hits_b, hits_c, hits_d]):
                inf += 1 
    return inf    


# #### A function to write data to file for plotting
# Here I iterate over each node and apply _count\_inf4_ which returns the number of loci that are informative for the subtending bipartition, and _count\_snps_ which counts snps segregating at that bipartition. This takes a few minutes to run.

# In[51]:


def nodes_dat(tree, lxs, datfilename):
    dat = []
    for node in tree.traverse():
        if not (node.is_leaf() or node.is_root()):
            loci = count_inf4(tree, lxs, node)
            dist = round(tree.get_distance(node),2)
            dat.append([dist, loci])
            node.name = "%d" % loci
    
    ## print tree with bls & node labels
    tree.write(format=3,outfile=datfilename+".tre")
    
    ## print data to file
    with open(datfilename, 'w') as outfile:
        np.savetxt(outfile, np.array(dat), fmt="%.2f")


# ## Make data files

# In[52]:


get_ipython().run_cell_magic('bash', '', '## a new directory to store the data in\nmkdir -p analysis_counts2\n')


# In[53]:


nodes_dat(Tbal, lxs_Tbal_droprad, 
          "analysis_counts2/Tbal_droprad.dat3")
nodes_dat(Tbal, lxs_Tbal_dropddrad, 
          "analysis_counts2/Tbal_dropddrad.dat3")
nodes_dat(Tbal, lxs_Tbal_covlow, 
          "analysis_counts2/Tbal_covlow.dat3")    
nodes_dat(Tbal, lxs_Tbal_covmed, 
          "analysis_counts2/Tbal_covmed.dat3")
nodes_dat(Tbal, lxs_Tbal_covfull,
          "analysis_counts2/Tbal_covfull.dat3")


# In[54]:


nodes_dat(Timb, lxs_Timb_droprad, 
          "analysis_counts2/Timb_droprad.dat3")
nodes_dat(Timb, lxs_Timb_dropddrad, 
          "analysis_counts2/Timb_dropddrad.dat3")
nodes_dat(Timb, lxs_Timb_covlow, 
          "analysis_counts2/Timb_covlow.dat3")    
nodes_dat(Timb, lxs_Timb_covmed, 
          "analysis_counts2/Timb_covmed.dat3")
nodes_dat(Timb, lxs_Timb_covfull,
          "analysis_counts2/Timb_covfull.dat3")


# In[55]:


nodes_dat(Tvib, lxs_Tvib_droprad, 
          "analysis_counts2/Tvib_droprad.dat3")  
nodes_dat(Tvib, lxs_Tvib_dropddrad, 
          "analysis_counts2/Tvib_dropddrad.dat3")    
nodes_dat(Tvib, lxs_Tvib_covlow, 
          "analysis_counts2/Tvib_covlow.dat3")    
nodes_dat(Tvib, lxs_Tvib_covmed, 
          "analysis_counts2/Tvib_covmed.dat3")
nodes_dat(Tvib, lxs_Tvib_covfull,
          "analysis_counts2/Tvib_covfull.dat3")


# # Plot the hierarchical distribution with the trees

# In[5]:


get_ipython().run_line_magic('load_ext', 'rpy2.ipython')


# In[6]:


get_ipython().run_cell_magic('R', '', 'library(ape)\n')


# ### Load in the data to R

# In[56]:


get_ipython().run_cell_magic('R', '', '## read in the data and factor results\ndat <- read.table("analysis_counts2/Tbal_droprad.dat3", \n                  header=F, col.names=c(\'depth\',\'loci\'))\ndat[,1] <- as.factor(dat[,1])\nTbal_droprad_Lme <- with(dat, tapply(loci, depth, mean))\n\ndat <- read.table("analysis_counts2/Tbal_dropddrad.dat3", \n                  header=F, col.names=c(\'depth\',\'loci\'))\ndat[,1] <- as.factor(dat[,1])\nTbal_dropddrad_Lme <- with(dat, tapply(loci, depth, mean))\n\ndat <- read.table("analysis_counts2/Tbal_covlow.dat3", \n                  header=F, col.names=c(\'depth\',\'loci\'))\ndat[,1] <- as.factor(dat[,1])\nTbal_covlow_Lme <- with(dat, tapply(loci, depth, mean))\n\ndat <- read.table("analysis_counts2/Tbal_covmed.dat3", \n                  header=F, col.names=c(\'depth\',\'loci\'))\ndat[,1] <- as.factor(dat[,1])\nTbal_covmed_Lme <- with(dat, tapply(loci, depth, mean))\n\ndat <- read.table("analysis_counts2/Tbal_covfull.dat3", \n                  header=F, col.names=c(\'depth\',\'loci\'))\ndat[,1] <- as.factor(dat[,1])\nTbal_covfull_Lme <- with(dat, tapply(loci, depth, mean))\n')


# In[57]:


get_ipython().run_cell_magic('R', '', '## read in the data and factor results\ndat <- read.table("analysis_counts2/Timb_droprad.dat3", \n                  header=F, col.names=c(\'depth\',\'loci\'))\ndat[,1] <- as.factor(dat[,1])\nTimb_droprad_Lme <- with(dat, tapply(loci, depth, mean))\n\ndat <- read.table("analysis_counts2/Timb_dropddrad.dat3", \n                  header=F, col.names=c(\'depth\',\'loci\'))\ndat[,1] <- as.factor(dat[,1])\nTimb_dropddrad_Lme <- with(dat, tapply(loci, depth, mean))\n\ndat <- read.table("analysis_counts2/Timb_covlow.dat3", \n                  header=F, col.names=c(\'depth\',\'loci\'))\ndat[,1] <- as.factor(dat[,1])\nTimb_covlow_Lme <- with(dat, tapply(loci, depth, mean))\n\ndat <- read.table("analysis_counts2/Timb_covmed.dat3", \n                  header=F, col.names=c(\'depth\',\'loci\'))\ndat[,1] <- as.factor(dat[,1])\nTimb_covmed_Lme <- with(dat, tapply(loci, depth, mean))\n\ndat <- read.table("analysis_counts2/Timb_covfull.dat3", \n                  header=F, col.names=c(\'depth\',\'loci\'))\ndat[,1] <- as.factor(dat[,1])\nTimb_covfull_Lme <- with(dat, tapply(loci, depth, mean))\n')


# In[58]:


get_ipython().run_cell_magic('R', '', '## read in the data and factor results\ndat <- read.table("analysis_counts2/Tvib_droprad.dat3", \n                  header=F, col.names=c(\'depth\',\'loci\'))\ndat[,1] <- as.factor(dat[,1])\nTvib_droprad_Lme <- with(dat, tapply(loci, depth, mean))\n\ndat <- read.table("analysis_counts2/Tvib_dropddrad.dat3", \n                  header=F, col.names=c(\'depth\',\'loci\'))\ndat[,1] <- as.factor(dat[,1])\nTvib_dropddrad_Lme <- with(dat, tapply(loci, depth, mean))\n\ndat <- read.table("analysis_counts2/Tvib_covlow.dat3", \n                  header=F, col.names=c(\'depth\',\'loci\'))\ndat[,1] <- as.factor(dat[,1])\nTvib_covlow_Lme <- with(dat, tapply(loci, depth, mean))\n\ndat <- read.table("analysis_counts2/Tvib_covmed.dat3", \n                  header=F, col.names=c(\'depth\',\'loci\'))\ndat[,1] <- as.factor(dat[,1])\nTvib_covmed_Lme <- with(dat, tapply(loci, depth, mean))\n\ndat <- read.table("analysis_counts2/Tvib_covfull.dat3", \n                  header=F, col.names=c(\'depth\',\'loci\'))\ndat[,1] <- as.factor(dat[,1])\nTvib_covfull_Lme <- with(dat, tapply(loci, depth, mean))\n')


# ### Plots

# In[59]:


get_ipython().run_cell_magic('R', '-w 400 -h 400', '\n#svg("box1.svg", width=4, height=5)\n\nL = Tbal_droprad_Lme\nplot(L, xlim=c(0,6), ylim=c(575,1025), \n     cex.axis=1.25, type=\'n\', xaxt="n")\n#abline(h=1000, lwd=2, col="#6E6E6E", lty="dotted")\n\nL = Tbal_covfull_Lme\ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\nlines(df1, type=\'l\', lwd=2, lty=2)\npoints(df1, cex=2.5, pch=21, col="#262626", bg="#262626", lwd=0.75)\n\nL = Tbal_droprad_Lme\ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\nlines(df1, type=\'l\', lwd=2, lty=2)\npoints(df1, cex=2.5, pch=21, col="#6E6E6E", bg="#6E6E6E", lwd=0.75)\n\n#L = Tbal_dropddrad_Lme \n#df1 = data.frame(as.numeric(names(L)),as.numeric(L))\n#points(df1, cex=3.5, pch=21, col="#6E6E6E", bg="#D3D3D3", lwd=0.75)\n\nbox()\n\naxis(side=1, at=seq(0,6,0.5), \n     labels=as.character(seq(6,0,by=-0.5)), cex.axis=1.25)\n\n#dev.off()\n')


# In[60]:


get_ipython().run_cell_magic('R', '-w 400 -h 400', '\n#svg("box2.svg", width=4, height=5)\n\nL = Tbal_covlow_Lme\nplot(L, xlim=c(0,6), ylim=c(575,1025), \n     cex.axis=1.25, type=\'n\', xaxt="n")\nabline(h=1000, lwd=2, col="#6E6E6E", lty="dotted")\n\nL = Tbal_covfull_Lme\ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\nlines(df1, type=\'l\', lwd=2, lty=2)\npoints(df1, cex=2.5, pch=21, col="#262626", bg="#262626", lwd=0.75)\n\nL = Tbal_covmed_Lme \ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\nlines(df1, type=\'l\', lwd=2, lty=2)\npoints(df1, cex=2.5, pch=21, col="#262626", bg="#6E6E6E", lwd=0.75)\n\n\n#L = Tbal_covlow_Lme\n#df1 = data.frame(as.numeric(names(L)),as.numeric(L))\n#points(df1, cex=3.5, pch=21, col="#262626", bg="#D3D3D3", lwd=0.75)\n\nbox()\n\naxis(side=1, at=seq(0,6,0.5), \n     labels=as.character(seq(6,0,by=-0.5)), cex.axis=1.25)\n\n#dev.off()\n')


# In[61]:


get_ipython().run_cell_magic('R', '-w 400 -h 400', '\n#svg("box3.svg", width=4, height=5)\n\n## samples every 6th to make plot more readable\n\nL = Timb_droprad_Lme[c(3:62)]\nplot(L, xlim=c(0,6), ylim=c(575, 1025), \n     cex.axis=1.25, type=\'n\', xaxt="n")#, yaxt="n")\n#abline(h=1000, lwd=2, col="#6E6E6E", lty="dotted")\n\nL = Timb_covfull_Lme[seq(3, 65, 6)]\ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\nlines(df1, type=\'l\', lwd=2, lty=2)\npoints(df1, cex=2.5, pch=21, col="#262626", bg="#262626", lwd=0.75)\n\nL = Timb_droprad_Lme[seq(3, 65, 6)]\ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\nlines(df1, type=\'l\', lwd=2, lty=2)\npoints(df1, cex=2.5, pch=21, col="#262626", bg="#6E6E6E", lwd=0.75)\n\nbox()\n\naxis(side=1, at=seq(0,6, 0.5), \n     labels=as.character(seq(6,0,by=-0.5)), cex.axis=1.25)\n\n#dev.off()\n')


# In[62]:


get_ipython().run_cell_magic('R', '-w 400 -h 400', '\n## samples every 6th to make plot more readable\n\n#svg("box4.svg", width=4, height=5)\nL = Timb_droprad_Lme[c(3:62)]\nplot(L, xlim=c(0,6), ylim=c(575, 1025), \n     cex.axis=1.25, type=\'n\', xaxt="n")#, yaxt="n")\n#abline(h=1000, lwd=2, col="#6E6E6E", lty="dotted")\n\nL = Timb_covfull_Lme[seq(3, 65, 6)]\ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\nlines(df1, type=\'l\', lwd=2, lty=2)\npoints(df1, cex=2.5, pch=21, col="#262626", bg="#262626", lwd=0.75)\n\nL = Timb_covmed_Lme[seq(3, 65, 6)]\ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\nlines(df1, type=\'l\', lwd=2, lty=2)\npoints(df1, cex=2.5, pch=21, col="#262626", bg="#6E6E6E", lwd=0.75)\n\nbox()\n\naxis(side=1, at=seq(0,6, 0.5), \n     labels=as.character(seq(6,0,by=-0.5)), cex.axis=1.25)\n#dev.off()\n')


# In[63]:


get_ipython().run_cell_magic('R', '-w 400 -h 500', '\n#svg("Sourcemissing.svg", height=10, width=8)\n#svg("Sourcemissing.svg", height=7, width=5.33)\n\nmat2 = matrix(c(1,1,1,4,4,4,7,7,7,  \n                1,1,1,4,4,4,7,7,7,\n                2,2,2,5,5,5,8,8,8,\n                3,3,3,6,6,6,9,9,9),\n                4,9, byrow=TRUE)\nlayout(mat2)\npar(mar=c(1,1,0,1), \n    oma=c(2,2,1,0))\n\n#########################################################\ntre <- read.tree("Tbal.tre")\nplot(tre, show.tip.label=F,\n     edge.width=2.5, type=\'p\', \n     x.lim=c(0,6))\n\n####  \nL = Tbal_droprad_Lme\nplot(L, xlim=c(0,6), ylim=c(-25,1200), \n     cex.axis=1.25, type=\'n\', xaxt="n")\nabline(h=1000, lwd=2, col="#6E6E6E", lty="dotted")\n\nL = Tbal_covfull_Lme\ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\npoints(df1, cex=1.5, pch=21, col="#262626", bg="#262626", lwd=0.75)\n\nL = Tbal_droprad_Lme\ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\npoints(df1, cex=1.5, pch=21, col="#262626", bg="#6E6E6E", lwd=0.75)\n\nL = Tbal_dropddrad_Lme \ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\npoints(df1, cex=1.5, pch=21, col="#262626", bg="#D3D3D3", lwd=0.75)\n\nbox()\n\n####\nL = Tbal_covlow_Lme\nplot(L, xlim=c(0,6), ylim=c(-25,1200), \n     cex.axis=1.25, type=\'n\', xaxt="n")\nabline(h=1000, lwd=2, col="#6E6E6E", lty="dotted")\n\nL = Tbal_covfull_Lme\ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\npoints(df1, cex=1.5, pch=21, col="#262626", bg="#262626", lwd=0.75)\n\nL = Tbal_covmed_Lme \ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\npoints(df1, cex=1.5, pch=21, col="#262626", bg="#6E6E6E", lwd=0.75)\n\nL = Tbal_covlow_Lme\ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\npoints(df1, cex=1.5, pch=21, col="#262626", bg="#D3D3D3", lwd=0.75)\n\naxis(side=1, at=seq(0,6,0.5), \n     labels=as.character(seq(6,0,by=-0.5)), cex.axis=1.25)\n\nbox()\n\n##########################################################\ntre <- read.tree("Timb.tre")\nplot(tre, show.tip.label=F,\n     edge.width=2.5, type=\'p\', \n     x.lim=c(0,6))\n\n####\nL = Timb_droprad_Lme[c(3:62)]\nplot(L, xlim=c(0,6), ylim=c(-25,1200), \n     cex.axis=1.25, type=\'n\', xaxt="n", yaxt="n")\nabline(h=1000, lwd=2, col="#6E6E6E", lty="dotted")\n\nL = Timb_covfull_Lme[c(3:62)]\ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\npoints(df1, cex=1.5, pch=21, col="#262626", bg="#262626", lwd=0.75)\n\nL = Timb_droprad_Lme[c(3:62)]\ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\npoints(df1, cex=1.5, pch=21, col="#262626", bg="#6E6E6E", lwd=0.75)\n\nL = Timb_dropddrad_Lme[c(3:62)]\ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\npoints(df1, cex=1.5, pch=21, col="#262626", bg="#D3D3D3", lwd=0.75)\n\nbox()\n\n####\nL = Timb_covlow_Lme[c(3:62)] \nplot(L, xlim=c(0,6), ylim=c(-25,1200), \n     cex.axis=1.25, type=\'n\', xaxt="n", yaxt="n")\nabline(h=1000, lwd=2, col="#6E6E6E", lty="dotted")\n\nL = Timb_covfull_Lme[c(3:62)]\ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\npoints(df1, cex=1.5, pch=21, col="#262626", bg="#262626", lwd=0.75)\n\nL = Timb_covmed_Lme[c(3:62)]\ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\npoints(df1, cex=1.5, pch=21, col="#262626", bg="#6E6E6E", lwd=0.75)\n\nL = Timb_covlow_Lme[c(3:62)] \ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\npoints(df1, cex=1.5, pch=21, col="#262626", bg="#D3D3D3", lwd=0.75)\n\nbox()\n\n##\naxis(side=1, at=seq(0,6,0.5), \n     labels=as.character(seq(6,0,by=-0.5)), cex.axis=1.25)\n\n#########################################################\ntre <- read.tree("Tvib.tre")\nplot(tre, show.tip.label=F,\n     edge.width=2.5, type=\'p\', \n     x.lim=c(0,6))\n\n####\nL = Tvib_droprad_Lme\nplot(L, xlim=c(0,6.25), ylim=c(-25,1200), \n     cex.axis=1.25, type=\'n\', xaxt="n", yaxt="n")\nabline(h=1000, lwd=2, col="#6E6E6E", lty="dotted")\n\nL = Tvib_covfull_Lme \ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\npoints(df1, cex=1.5, pch=21, col="#262626", bg="#262626", lwd=0.75)\n\nL = Tvib_droprad_Lme\ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\npoints(df1, cex=1.5, pch=21, col="#262626", bg="#6E6E6E", lwd=0.75)\n\nL = Tvib_dropddrad_Lme \ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\npoints(df1, cex=1.5, pch=21, col="#262626", bg="#D3D3D3", lwd=0.75)\n\nbox()\n####\n\nplot(L, xlim=c(0,6.25), ylim=c(-25,1200), \n     cex.axis=1.25, type=\'n\', xaxt="n", yaxt="n")\nabline(h=1000, lwd=2, col="#6E6E6E", lty="dotted")\n\nL = Tvib_covfull_Lme \ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\npoints(df1, cex=1.5, pch=21, col="#262626", bg="#262626", lwd=0.75)\n\nL = Tvib_covmed_Lme \ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\npoints(df1, cex=1.5, pch=21, col="#262626", bg="#6E6E6E", lwd=0.75)\n\nL = Tvib_covlow_Lme\ndf1 = data.frame(as.numeric(names(L)),as.numeric(L))\npoints(df1, cex=1.5, pch=21, col="#262626", bg="#D3D3D3", lwd=0.75)\n\nbox()\n##\naxis(side=1, at=seq(0,6,1), \n     labels=as.character(seq(6,0,by=-1)), cex.axis=1.25)\n\n#dev.off()\n')


# ## Empirical data (full & half depth)
# Here I am grabbing the assembled empirical data from notebook_1 (Viburnum) to compare the effect of sequencing coverage with the results we see when simulating data on that tree. 

# In[64]:


Tvib2 = Tvib.copy()
for node in Tvib2:
    node.name = node.name+"_0"


# In[70]:


## full size data
lxs_EmpVib_full = getarray("/home/deren/Dropbox/RADexplore/EmpVib/vib_full_64tip_c85d6m4p99.loci", Tvib)#, dropind=1)
lxs_EmpVib_half = getarray("/home/deren/Dropbox/RADexplore/EmpVib/vib_half_64tip_c85d6m4p99.loci", Tvib)#, dropind=1)


# In[ ]:


share_full = countmatrix(lxs_EmpVib_full,lxs_EmpVib_full)
plotSVGmatrix(share_full, "EmpVib_full")


# In[ ]:


share_half = countmatrix(lxs_EmpVib_half,lxs_EmpVib_half)
plotSVGmatrix(share_half, "EmpVib_half")


# In[ ]:


nodes_dat(Tvib, lxs_EmpVib_half, 
          "analysis_counts/Tvib_Emp_half.dat3")
nodes_dat(Tvib, lxs_EmpVib_full, 
          "analysis_counts/Tvib_Emp_full.dat3")


# In[ ]:


get_ipython().run_cell_magic('R', '', '## read in the data and factor results\ndat <- read.table("analysis_counts/Tvib_Emp_full.dat3", \n                  header=F, col.names=c(\'depth\',\'loci\'))\ndat[,1] <- as.factor(dat[,1])\nEmpVib_full_Lme <- with(dat, tapply(loci, depth, mean))\n\n## read in the data and factor results\ndat <- read.table("analysis_counts/Tvib_Emp_half.dat3", \n                  header=F, col.names=c(\'depth\',\'loci\'))\ndat[,1] <- as.factor(dat[,1])\nEmpVib_half_Lme <- with(dat, tapply(loci, depth, mean))\n')


# In[ ]:


get_ipython().run_cell_magic('R', '', 'EmpVib_half_Lme\n')


# In[ ]:


get_ipython().run_cell_magic('R', '-w 200 -h 400', '\n#svg("EmpVib_fullvhalf3.svg", height=6, width=2.5)\n\nmat2 <- matrix(c(1,1,1,2),byrow=TRUE)\nlayout(mat2)\npar(mar=c(1,1,0,1), \n    oma=c(2,2,1,0))\n\n#########################################################\n#tre <- read.tree("Tvib.tre")\n#plot(tre, show.tip.label=F,\n#     edge.width=2.5, type=\'p\', \n#     x.lim=c(0,2.25))\nVtre <- read.tree("analysis_counts/EmpVib_full.dat3.tre")\nplot(Vtre, cex=0.6, adj=0.05, x.lim=c(0,2.25),\n     edge.width=2.5, type=\'p\',show.tip.label=FALSE)\nnodelabels(pch=20, col="black",\n           cex=as.integer(Vtre$node.label)/7500)\n\n####\nL = EmpVib_half_Lme\nplot(L, xlim=c(0,2.25), ylim=c(-25,50200), \n     cex.axis=1.25, type=\'n\', xaxt="n")\ndf1 = data.frame(as.numeric(names(L)),\n                 as.numeric(L))\npoints(df1, cex=1.25, pch=21, col="red", bg="red")\n\n####\nL = EmpVib_full_Lme\n#plot(L, xlim=c(0,2.25), ylim=c(-25,50200), \n#     cex.axis=1.25, type=\'n\', xaxt="n")\nabline(h=0, lwd=2, col="gray", lty="dotted")\ndf1 = data.frame(as.numeric(names(L)),\n                 as.numeric(L))\npoints(df1, cex=1.25, pch=21, col="#262626", bg="#262626")\n\n#dev.off()\n')


# In[ ]:


get_ipython().run_cell_magic('R', '-w 200 -h 400', '\n#svg("fullvhalf3.svg", height=4.5, width=4)\n\n####\nL = EmpVib_full_Lme\nplot(L, xlim=c(0,2.25), ylim=c(-25,50200), \n     cex.axis=1.25, type=\'n\', xaxt="n")\nabline(h=0, lwd=2, col="gray", lty="dotted")\ndf1 = data.frame(as.numeric(names(L)),\n                 as.numeric(L))\npoints(df1, cex=1.25, pch=21, col="#262626", bg="#262626")\n\n\ndev.off()\n\nsvg("fullonly.svg", height=4.5, width=4)\n####\nL = EmpVib_half_Lme\nplot(L, xlim=c(0,2.25), ylim=c(-25,50200), \n     cex.axis=1.25, type=\'n\', xaxt="n")\ndf1 = data.frame(as.numeric(names(L)),\n                 as.numeric(L))\npoints(df1, cex=1.25, pch=21, col="red", bg="red")\n\n####\nL = EmpVib_full_Lme\nabline(h=0, lwd=2, col="gray", lty="dotted")\ndf1 = data.frame(as.numeric(names(L)),\n                 as.numeric(L))\npoints(df1, cex=1.25, pch=21, col="#262626", bg="#262626")\n\n#dev.off()\n')


# In[ ]:


get_ipython().run_cell_magic('R', '', 'data.frame(cbind(median(EmpVib_full_Lme),\n           median(EmpVib_half_Lme)),\n           cbind(mean(EmpVib_full_Lme),\n           mean(EmpVib_half_Lme)),\n     col.names=c("full","half"))\n')


# In[ ]:


get_ipython().run_cell_magic('R', '', 'svg(\'hist.svg\', width=4.25, height=4)\nhist(rnorm(10000), col="grey")\ndev.off()\n')


# In[ ]:


get_ipython().run_cell_magic('R', '-w 300 -h 600', '\nVtre <- read.tree("analysis_counts/EmpVib_full.dat3.tre")\nsvg("EmpVib_full_nodes.svg", height=6, width=3)\nplot(Vtre, cex=0.6, adj=0.05,\n     edge.width=3, show.tip.label=FALSE)\nnodelabels(pch=20, col="black",\n           cex=as.integer(Vtre$node.label)/10000)\ndev.off()\n')


# In[ ]:


# ### plOT

# In[ ]:


get_ipython().run_cell_magic('R', '-w 300 -h 600', '\n#svg("locisnpsdepth.svg", height=8, width=4)\n#pdf("locisnpsdepth.pdf", height=8, width=4)\n\nmat2 <- matrix(c(1,1,1,5,5,5,\n                 1,1,1,5,5,5,\n                 2,2,2,6,6,6,\n                 3,3,3,7,7,7,\n                 4,4,4,8,8,8), \n                 5,6, byrow=TRUE)\nlayout(mat2)\npar(mar=c(1,1,0,1), \n    oma=c(2,2,1,0))\ntre <- read.tree("Tbal.tre")\nplot(tre, show.tip.label=F,\n     edge.width=2.5, type=\'p\', \n     x.lim=c(-0.25,2.75))\n\n##-------------------------------\n## Plot full data locus sharing\nx = seq(1.5,5.5)\ny = Tbal_full_Lme#[1:6]\ns = Tbal_full_Lsd#[1:6]\nplot(x, y, xlim=c(1,7.2), ylim=c(-25,3100), \n     cex.axis=1.25, type=\'n\', xaxt="n")\nabline(h=(seq(0,1000,200)), lwd=1.5, col="gray", lty="dotted")\npoints(x,y, cex=1.25, pch=21, col="#262626", bg="#262626")\nlines(x,y, lwd=2, col="#262626")\nsegments(x, y-s, x, y+s, col="#262626")\nepsilon = 0.02\nsegments(x-epsilon,y-s,x+epsilon,y-s)\nsegments(x-epsilon,y+s,x+epsilon,y+s)\n\nx = seq(1.5,5.5)\ny = Tbal_full_Sme#[2:6]\ns = Tbal_full_Ssd#[2:6]\nlines(x, y, lwd=2, col="darkgrey")\npoints(x, y, cex=1.25, pch=21, col="darkgrey", bg="darkgrey")\nsegments(x, y-s, x, y+s, col="darkgrey")\nepsilon = 0.02\nsegments(x-epsilon,y-s,x+epsilon,y-s)\nsegments(x-epsilon,y+s,x+epsilon,y+s)\n\n##--------------------------------\n## Plot drop data locus sharing\nx = seq(1.5,5.5)\ny = Tbal_drop_Lme\ns = Tbal_drop_Lsd\nplot(x, y, xlim=c(1,7.2), ylim=c(-25,3100), \n     cex.axis=1.25, type=\'n\', xaxt="n")\nabline(h=(seq(0,1000,200)), lwd=1.5, col="gray", lty="dotted")\npoints(x,y, cex=1.25, pch=21, col="#262626", bg="#262626")\nlines(x,y, lwd=2, col="#262626")\nsegments(x, y-s, x, y+s, col="#262626")\nepsilon = 0.02\nsegments(x-epsilon,y-s,x+epsilon,y-s)\nsegments(x-epsilon,y+s,x+epsilon,y+s)\n\nx = seq(1.5,5.5)\ny = Tbal_drop_Sme\ns = Tbal_drop_Ssd\nlines(x, y, lwd=2, col="darkgrey")\npoints(x, y, cex=1.25, pch=21, col="darkgrey", bg="darkgrey")\nsegments(x, y-s, x, y+s, col="darkgrey")\nepsilon = 0.02\nsegments(x-epsilon,y-s,x+epsilon,y-s)\nsegments(x-epsilon,y+s,x+epsilon,y+s)\n\n##------------------------------\n## Plot cov data locus sharing\nx = seq(1.5,5.5)\ny = Tbal_cov_Lme\ns = Tbal_cov_Lsd\nplot(x, y, xlim=c(1,7.2), ylim=c(-25,3100), \n     cex.axis=1.25, type=\'n\', xaxt="n")\nabline(h=(seq(0,1000,200)), lwd=1.5, col="gray", lty="dotted")\npoints(x,y, cex=1.25, pch=21, col="#262626", bg="#262626")\nlines(x,y, lwd=2, col="#262626")\nsegments(x, y-s, x, y+s, col="#262626")\nepsilon = 0.02\nsegments(x-epsilon,y-s,x+epsilon,y-s)\nsegments(x-epsilon,y+s,x+epsilon,y+s)\n\nx = seq(1.5,5.5)\ny = Tbal_cov_Sme\ns = Tbal_cov_Ssd\nlines(x, y, lwd=2, col="darkgrey")\npoints(x, y, cex=1.25, pch=21, col="darkgrey", bg="darkgrey")\nsegments(x, y-s, x, y+s, col="darkgrey")\nepsilon = 0.02\nsegments(x-epsilon,y-s,x+epsilon,y-s)\nsegments(x-epsilon,y+s,x+epsilon,y+s)\naxis(side=1, at=seq(1.1,8.1,1), \n     labels=as.character(seq(3,-0.5,by=-0.5)), cex.axis=1.25)\n\n###########################################\n###########################################\ntre <- read.tree("Timb.tre")\nplot(tre, show.tip.label=F, \n     edge.width=2, type=\'p\')\n##------------------------------------\n## Plot full data locus sharing\nx = seq(2,62)\ny = Timb_full_Lme[2:62]\ns = Timb_full_Lsd[2:62]\nplot(x, y, xlim=c(1,65), ylim=c(-25,3100), \n     cex.axis=1.25, type=\'n\', yaxt="n", xaxt="n")\nabline(h=(seq(0,1000,200)), lwd=1.5, col="gray", lty="dotted")\npoints(x,y, cex=1, pch=21, col="#262626", bg="#262626")\nlines(x,y, lwd=2, col="#262626")\nsegments(x, y-s, x, y+s, col="#262626")\nepsilon = 0.02\nsegments(x-epsilon,y-s,x+epsilon,y-s)\nsegments(x-epsilon,y+s,x+epsilon,y+s)\n\ny = Timb_full_Sme[2:62]\ns = Timb_full_Ssd[2:62]\nlines(x, y, lwd=2, col="darkgrey")\n#points(x, y, cex=1, pch=21, col="darkgrey", bg="darkgrey")\nsegments(x, y-s, x, y+s, col="darkgrey")\nepsilon = 0.02\nsegments(x-epsilon,y-s,x+epsilon,y-s)\nsegments(x-epsilon,y+s,x+epsilon,y+s)\n\n##----------------------------------\n## Plot full data locus sharing\nx = seq(2,62)\ny = Timb_drop_Lme[2:62]\ns = Timb_drop_Lsd[2:62]\nplot(x, y, xlim=c(1,65), ylim=c(-25,3100), \n     cex.axis=1.25, type=\'n\', yaxt="n", xaxt="n")\nabline(h=(seq(0,1000,200)), lwd=1.5, col="gray", lty="dotted")\npoints(x,y, cex=1, pch=21, col="#262626", bg="#262626")\nlines(x,y, lwd=2, col="#262626")\nsegments(x, y-s, x, y+s, col="#262626")\nepsilon = 0.02\nsegments(x-epsilon,y-s,x+epsilon,y-s)\nsegments(x-epsilon,y+s,x+epsilon,y+s)\n\ny = Timb_drop_Sme[2:62]\ns = Timb_drop_Ssd[2:62]\nlines(x, y, lwd=2, col="darkgrey")\n#points(x, y, cex=1, pch=21, col="darkgrey", bg="darkgrey")\nsegments(x, y-s, x, y+s)\nepsilon = 0.02\nsegments(x-epsilon,y-s,x+epsilon,y-s)\nsegments(x-epsilon,y+s,x+epsilon,y+s)\n\n##-----------------------------------\n## Plot drop data locus sharing\nx = seq(2,62)\ny = Timb_cov_Lme[2:62]\ns = Timb_cov_Lsd[2:62]\nplot(x, y,\n     xlim=c(1,65), ylim=c(-20,3100), \n     cex=1, cex.axis=1.25,\n     pch=21, bg="#262626", xaxt="n", yaxt="n")\nabline(h=(seq(0,1000,200)), lwd=1.5, col="gray", lty="dotted")\npoints(x,y, cex=1, pch=21, col="#262626", bg="#262626")\nlines(x,y, lwd=2, col="#262626")\nsegments(x, y-s, x, y+s, col="#262626")\nepsilon = 0.02\nsegments(x-epsilon,y-s,x+epsilon,y-s)\nsegments(x-epsilon,y+s,x+epsilon,y+s)\n\ny = Timb_cov_Sme[2:62]\ns = Timb_cov_Ssd[2:62]\nlines(x, y, lwd=2, col="darkgrey")\n#points(x, y, cex=1, pch=21, col="darkgrey", bg="darkgrey")\nsegments(x, y-s, x, y+s, col="darkgrey")\nepsilon = 0.02\nsegments(x-epsilon,y-s,x+epsilon,y-s)\nsegments(x-epsilon,y+s,x+epsilon,y+s)\naxis(side=1, at=seq(2.1,72,10), \n     labels=as.character(seq(3,0,by=-0.5)), cex.axis=1.25)\n\n#dev.off()\n')


# ### PLOt nodes on tree

# In[ ]:


lxs_EmpVib_full


# In[ ]:


def write_nodes_to_tree(tree,lxs,treefilename):
    for node in tree.traverse():
        if not node.is_leaf():
            inf = count_inf4(tree, lxs, node)
            node.name = "%d" % inf
    
    ## print tree with bls & node labels
    tree.write(format=3,outfile=treefilename)


# In[ ]:


write_nodes_to_tree(Tvib, lxs_EmpVib_full, "Tvib_full_nodes.tre")


# In[ ]:


get_ipython().run_cell_magic('R', '-w 400 -h 500', 'tre <- read.tree("loci_Tbal_cov")\nplot(tre)#, show.tip.label=F, edge.width=2.5)\n#nodelabels(pch=21, \n#           bg="#262626",\n#           cex=as.integer(tre$node.label)/150)\nnodelabels(tre$node.label, bg=\'grey\', cex=1.5)\n')


# ## Data sharing by sub-sampling

# How much data are shared by a random N samples, and how much data are shared across the deepest bipartition for 2+N samples. _Also how many SNPs?_

# In[ ]:


def counts(lxs, minr, maxr, maxi):
    ## data store
    data = np.zeros((maxr+1-minr,maxi))
    
    for sample in range(minr, maxr+1):
        g = itertools.combinations(range(maxr), sample)
        i = 0
        while i<maxi:
            try: 
                gsamp = g.next()
            except StopIteration:
                break
            shared = sum(lxs[gsamp,:].sum(axis=0) == len(gsamp))
            data[sample-minr,i] = shared
            i += 1
    return data


# In[ ]:


Dbal = counts(lxs_Tbal_drop, 4, 32, 1000)
Dimb = counts(lxs_Timb_drop, 4, 32, 1000)


# In[ ]:


Dbal


# In[ ]:


def counts2(lxs, minr, maxr, maxi):
    ## data store
    data = np.zeros(((maxr+1-minr)*maxi, 3))
    count = 0
    for sample in range(minr, maxr+1):
        g = itertools.combinations(range(maxr), sample)
        i = 0
        while i<maxi:
            try: 
                gsamp = g.next()
            except StopIteration:
                break
            shared = sum(lxs[gsamp,:].sum(axis=0) == len(gsamp))
            datum = [sample, float(shared), i+1]
            data[count] = datum
            i += 1
            count += 1
    return data


# In[ ]:


Dimb1 = counts2(lxs_Timb_full, 4, 32, 100)
Dimb2 = counts2(lxs_Timb_drop, 4, 32, 100)
Dimb3 = counts2(lxs_Timb_cov, 4, 32, 100)


# In[ ]:


Dbal1 = counts2(lxs_Tbal_full, 4, 32, 100)
Dbal2 = counts2(lxs_Tbal_drop, 4, 32, 100)
Dbal3 = counts2(lxs_Tbal_cov, 4, 32, 100)


# In[ ]:


def saveto(D, outname):
    dd = pd.DataFrame({"time":[i[0] for i in D],
                       "loci":[i[1] for i in D],
                       "idx":[i[2] for i in D],
                       "tree":["Timb" for _ in D]})
    dd.to_csv(outname)


# In[ ]:


saveto(Dimb1, "Dimb1.dat")
saveto(Dimb2, "Dimb2.dat")
saveto(Dimb3, "Dimb3.dat")
saveto(Dbal1, "Dbal1.dat")
saveto(Dbal2, "Dbal2.dat")
saveto(Dbal3, "Dbal3.dat")