#!/usr/bin/env python
# coding: utf-8

# ## Simulate RAD-seq data
# The simulations software simrrls is available at [github.com/dereneaton/simrrls](github.com/dereneaton/simrrls). First we create a directory called `ipsimdata/` and then simulate data and put it in this directory. Simrrls has a few dependencies that are required. 

# ### Global variables

# In[52]:


## name for our sim data directory
DIR = "./ipsimdata"

## A mouse MT genome used to stick our data into.
INPUT_CHR = "/home/deren/Downloads/MusMT.fa"

## number of RAD loci to simulate
NLOCI = 1000

## number of inserts to reference genome and insert size
N_INSERTS = 100
INSERT_SIZE = 50


# ### Set up / clean up directories

# In[53]:


import os
import shutil

## rm sim dir if it exists, else create it.
while 1:
    if os.path.exists(DIR):
        shutil.rmtree(DIR)
    else:
        os.mkdir(DIR)
    ## make sure dir is finished removing
    if os.path.exists(DIR):
        break

## rm testdirs if they exist
TESTDIRS = ["./testref1", "./testref2", "./testref3", "./testref4"]
for testdir in TESTDIRS:
    if os.path.exists(testdir):
        shutil.rmtree(testdir)


# ### Simulate the RAD data

# In[54]:


import simrrls
print 'simrrls', simrrls.__version__


# In[55]:


import subprocess

## this the bash command-line call to simrrls
cmd = """\
simrrls -o {odir}/{oname} -f {form} -dm 10 -ds 2 
        -I 0.01 -L {nloci} -i1 {imin} -i2 {imax} 
"""
        
## simulate rad_example (includes indels)
call = cmd.format(odir=DIR, oname="rad_example", form="rad", 
                  imin=50, imax=100, nloci=NLOCI)
print call
subprocess.check_output(call.split())

## simulate gbs_example (includes indels)
call = cmd.format(odir=DIR, oname="gbs_example", form="gbs", 
                  imin=50, imax=100, nloci=NLOCI)
print call
subprocess.check_output(call.split())

## simulate pairddrad_example (includes indels)
call = cmd.format(odir=DIR, oname="pairddrad_example", form="pairddrad", 
                  imin=50, imax=100, nloci=NLOCI)
print call
subprocess.check_output(call.split())

## simulate gbs_example (includes indels)
call = cmd.format(odir=DIR, oname="pairgbs_example", form="pairgbs",
                  imin=50, imax=100, nloci=NLOCI)
print call
subprocess.check_output(call.split())

## simulate pairddrad_example (includes indels and merged reads)
call = cmd.format(odir=DIR, oname="pairddrad_wmerge_example", form="pairddrad", 
                  imin=-50, imax=50, nloci=NLOCI)
print call
subprocess.check_output(call.split())

## simulate gbs_example (includes indels and merged reads)
call = cmd.format(odir=DIR, oname="pairgbs_wmerge_example", form="pairgbs", 
                  imin=-50, imax=50, nloci=NLOCI)
print call
subprocess.check_output(call.split())


# ## Manufacture a psuedo-genome with hits from sim data
# 
# These functions take simulated rad-data and a "large" input genome (really it could just be a randomly generated fasta), and randomly inserts a handful of simulated rad tags into the genome. This guarantees that reference mapping will actually do something. For PE simulated data R2 reads are reversed before they're inserted, because smalt is using the `-l pe` flag, which looks for reads in this orientation __`--> <--`__. Also, for PE inner mate distance is fixed at 50. If you wanna get ambitious you could draw this value from a distribution, but seems like more effort than it's worth. This wants to be run from __`ipsimdata/`__, but you can run it from anywhere if you update the paths.
# 

# In[56]:


import itertools
import gzip
import random
from Bio import SeqIO


# ## Function to insert reads into simulated genome

# In[57]:


## Utility function
def revcomp(sequence):
    "returns reverse complement of a string"
    sequence = sequence[::-1].strip()\
                             .replace("A", "t")\
                             .replace("T", "a")\
                             .replace("C", "g")\
                             .replace("G", "c").upper()
    return sequence


# In[58]:


def RAD_to_genome(R1s, R2s, n_inserts, insert_sz, input_chr, out_chr):
    """ 
    Writes simulated rad data into a genome fasta file. 
    Assumes RAD data file has names formatted like the 
    output from simrrls.
    """
    ## read in the full genome file
    record = SeqIO.read(input_chr, "fasta")
    lenchr = len(record.seq)

    ## read in the RAD data files
    dat1 = gzip.open(R1s, 'r')
    qiter1 = itertools.izip(*[iter(dat1)]*4)
    if R2s:
        dat2 = gzip.open(R2s, 'r')
        qiter2 = itertools.izip(*[iter(dat2)]*4)
    else:
        qiter2 = itertools.izip(*[iter(str, 1)]*4)
       
    ## sample unique reads from rads
    uniqs = []
    locid = 0
    while len(uniqs) < n_inserts:
        ## grab a read and get locus id
        qrt1 = qiter1.next()
        qrt2 = qiter2.next()
        iloc = []
        ilocid = int(qrt1[0].split("_")[1][5:])
        ## go until end of locus copies
        while ilocid == locid:
            iloc.append([qrt1[1].strip(), qrt2[1].strip()])
            qrt1 = qiter1.next()
            qrt2 = qiter2.next()
            ilocid = int(qrt1[0].split("_")[1][5:])
        ## sample one read
        uniqs.append(random.sample(iloc, 1)[0])
        locid += 1
        
    ## insert RADs into genome
    sloc = 100
    for ins in range(n_inserts):   
        ## get read, we leave the barcode on cuz it won't hurt
        r1 = uniqs[ins][0]
        r2 = uniqs[ins][1]
        if not r2:
            record.seq = record.seq[:sloc]+r1+\
                         record.seq[sloc:]
        else: 
            record.seq = record.seq[:sloc]+r1+\
                         record.seq[sloc:sloc+insert_sz]+\
                         revcomp(r2)+\
                         record.seq[sloc+insert_sz:]          
        sloc += 300
            
    ## write to file
    rlen = len(qrt1[1].strip())
    if r2: 
        rlen *= 2
    print("input genome is {} bp".format(lenchr))
    print('imputed {} loci {} bp in len'.format(n_inserts, rlen))
    print("new pseudo-genome is {} bp".format(len(record.seq)))
    output_handle = open(out_chr, "w")
    SeqIO.write(record, output_handle, "fasta")
    output_handle.close()


# ## Make pseudo-ref data files

# In[59]:


## SE RAD data
DATA_R1 = DIR+"/rad_example_R1_.fastq.gz"
OUTPUT_CHR = DIR+"/rad_example_genome.fa"
RAD_to_genome(DATA_R1, 0, N_INSERTS, INSERT_SIZE, INPUT_CHR, OUTPUT_CHR)


# In[60]:


## SE GBS data
DATA_R1 = DIR+"/gbs_example_R1_.fastq.gz"
OUTPUT_CHR = DIR+"/gbs_example_genome.fa"
RAD_to_genome(DATA_R1, 0, N_INSERTS, INSERT_SIZE, INPUT_CHR, OUTPUT_CHR)


# In[61]:


## PAIR ddRAD data 
DATA_R1 = DIR+"/pairddrad_wmerge_example_R1_.fastq.gz"
DATA_R2 = DIR+"/pairddrad_wmerge_example_R2_.fastq.gz"
OUTPUT_CHR = DIR+"/pairddrad_wmerge_example_genome.fa"
RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, INSERT_SIZE, INPUT_CHR, OUTPUT_CHR)


# In[62]:


## PAIR GBS data
DATA_R1 = DIR+"/pairgbs_wmerge_example_R1_.fastq.gz"
DATA_R2 = DIR+"/pairgbs_wmerge_example_R2_.fastq.gz"
OUTPUT_CHR = DIR+"/pairgbs_wmerge_example_genome.fa"
RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, INSERT_SIZE, INPUT_CHR, OUTPUT_CHR)


# ## Run tests
# ### rad_example

# In[63]:


import ipyrad as ip

## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "testref1")
data1.set_params(2, DIR+'/rad_example_R1_.fastq.gz')
data1.set_params(3, DIR+'/rad_example_barcodes.txt')

## branch into an assembly for reference
data2 = data1.branch("reference")
data2.set_params(5, 'reference')
data2.set_params(6, DIR+'/rad_example_genome.fa')

## assemble both
data1.run(force=True)
data2.run(force=True)

## check results
assert data1.stats_dfs.s7_loci.sum_coverage.max() == NLOCI
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS


# ### gbs example

# In[64]:


import ipyrad as ip

## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "testref2")
data1.set_params(2, DIR+'/gbs_example_R1_.fastq.gz')
data1.set_params(3, DIR+'/gbs_example_barcodes.txt')

## branch into an assembly for reference
data2 = data1.branch("reference")
data2.set_params(5, 'reference')
data2.set_params(6, DIR+'/gbs_example_genome.fa')

## assemble both
data1.run(force=True)
data2.run(force=True)

## check results
assert data1.stats_dfs.s7_loci.sum_coverage.max() == NLOCI
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS


# ### pairddrad example

# In[65]:


import ipyrad as ip

## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "testref3")
data1.set_params(2, DIR+'/pairddrad_wmerge_example_R1_.fastq.gz')
data1.set_params(3, DIR+'/pairddrad_wmerge_example_barcodes.txt')

## branch into an assembly for reference
data2 = data1.branch("reference")
data2.set_params(5, 'reference')
data2.set_params(6, DIR+'/pairddrad_wmerge_example_genome.fa')

## assemble both
data1.run(force=True)
data2.run(force=True)

## check results
assert data1.stats_dfs.s7_loci.sum_coverage.max() == NLOCI
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS


# In[66]:


import ipyrad as ip

## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "testref4")
data1.set_params(2, DIR+'/pairgbs_wmerge_example_R1_.fastq.gz')
data1.set_params(3, DIR+'/pairgbs_wmerge_example_barcodes.txt')

## branch into an assembly for reference
data2 = data1.branch("reference")
data2.set_params(5, 'reference')
data2.set_params(6, DIR+'/pairgbs_wmerge_example_genome.fa')

## assemble both
data1.run(force=True)
data2.run(force=True)

## check results
assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS


# ### Clean up test dirs

# In[67]:


import glob
import os

## rm dir if it exists, else create it.
for tdir in glob.glob("testref[1-9]"):
    shutil.rmtree(tdir)
        
## rm reference index
for iref in glob.glob(DIR+"/*_genome.fa.*"):
    os.remove(iref)


# ### Create zipped simdata archive

# In[68]:


get_ipython().run_cell_magic('bash', '', '## compressed dir/ w/ all data files\ntar -zcvf ipsimdata.tar.gz ipsimdata/* \n')