#!/usr/bin/env python
# coding: utf-8

# # Simulated RAD-seq data for ipyrad testing
# The simulations software simrrls is available at [github.com/dereneaton/simrrls](github.com/dereneaton/simrrls). 
# 
# 

# In[1]:


import os
import glob
import gzip
import itertools
import numpy as np
import simrrls
import ipyrad as ip

print "simrrls v.{}".format(simrrls.__version__)
print "ipyrad v.{}".format(ip.__version__)


# ### Organization

# In[31]:


## the dir to write data to
DIR = "./ipsimdata/"

## if it doesn't exist make it
if not os.path.exists(DIR):
    os.makedirs(DIR)
    
## if theres anything in it, remove it
for oldfile in glob.glob(os.path.join(DIR, "*")):
    os.remove(oldfile)
    

# ## Simulate the data with simrrls

# In[32]:


get_ipython().run_cell_magic('bash', '-s $DIR', '\n## simulate rad_example (includes indels)\nsimrrls -o $1/rad_example -f rad -dm 10 -ds 2 -I 0.05 -L 1000\n\n## simulate larger rad data set\n#simrrls -o $1/rad_large -f rad -dm 10 -ds 2 -I 0.01 -L 10000\n\n## simulate pairddrad_example\nsimrrls -o $1/gbs_example -f gbs -dm 10 -ds 2 -I 0.05 -L 1000\n\n## simulate pairddrad_example\nsimrrls -o $1/pairddrad_example -f pairddrad -dm 10 -ds 2 -L 1000\n\n## simulate pairgbs_example\nsimrrls -o $1/pairgbs_example -f pairgbs -dm 10 -ds 2 -L 1000\n\n## simulate pairddrad_wmerge_example \nsimrrls -o $1/pairddrad_wmerge_example -f pairddrad -dm 10 -ds 2 \\\n                                         -i1 -50 -i2 100 -L 1000\n\n## simulate pairgbs_wmerge_example (mixture of merged and non-merged reads)\nsimrrls -o $1/pairgbs_wmerge_example -f pairgbs -dm 10 -ds 2 \\\n                                       -i1 -50 -i2 100 -L 1000\n')


# ## Functions to insert reads into simulated genome

# In[33]:


import itertools
import gzip
import numpy as np


# In[34]:


## Utility function
def revcomp(sequence):
    "returns reverse complement of a string"
    sequence = sequence[::-1].strip()\
                             .replace("A", "t")\
                             .replace("T", "a")\
                             .replace("C", "g")\
                             .replace("G", "c").upper()
    return sequence


# In[35]:


def RAD_to_genome(R1s, R2s, n_inserts, insert_low, insert_high, out_chr):
    """ 
    Create a simulated genome file with RAD data interspersed.
    """
    
    ## start genome fasta sequence with 5000 random bp
    genome = np.random.choice(list("ATGC"), 5000)
    
    ## read in the RAD data files
    dat1 = gzip.open(R1s, 'r')
    qiter1 = itertools.izip(*[iter(dat1)]*4)
    if R2s:
        dat2 = gzip.open(R2s, 'r')
        qiter2 = itertools.izip(*[iter(dat2)]*4)
    else:
        qiter2 = itertools.izip(*[iter(str, 1)]*4)
       
    ## sample unique reads from rads
    uniqs = []
    locid = 0
    while len(uniqs) < n_inserts:
        ## grab a read and get locus id
        qrt1 = qiter1.next()
        qrt2 = qiter2.next()
        iloc = []
        ilocid = int(qrt1[0].split("_")[1][5:])
        
        ## go until end of locus copies
        while ilocid == locid:
            iloc.append([qrt1[1].strip(), qrt2[1].strip()])
            qrt1 = qiter1.next()
            qrt2 = qiter2.next()
            ilocid = int(qrt1[0].split("_")[1][5:])
        
        ## sample one read
        uniqs.append(iloc[0])
        locid += 1
        
    ## insert RADs into genome
    for ins in range(n_inserts):   
        ## get read, we leave the barcode on cuz it won't hurt
        r1 = np.array(list(uniqs[ins][0]))
        r2 = np.array(list(revcomp(uniqs[ins][1])))
        
        ## add R1 to genome
        genome = np.concatenate((genome, r1), axis=0)
        if qrt2[0]:
            ## add insert size
            howlong = np.random.uniform(insert_low, insert_high)
            chunk = np.random.choice(list("ATGC"), int(howlong))
            #print howlong
            genome = np.concatenate((genome, chunk), axis=0)
            ## add read2
            genome = np.concatenate((genome, r2), axis=0)

        ## add spacer between loci
        genome = np.concatenate((genome, np.random.choice(list("ATGC"), 5000)), axis=0)

    print("input genome is {} bp".format(genome.shape[0]))
    print("inserted {} loci into genome".format(n_inserts))

    with open(out_chr, "w") as fasta:
        header = ">MT dna:chromosome chromosome:test:MT:1:{}:1 REF\n"\
                 .format(genome.shape[0])
        fasta.write(header)
        outseq = list(genome)
        for i in range(0, len(outseq), 70):
            fasta.write("{}\n".format("".join(outseq[i:i+70])))


# ## Make 'genome' data files

# In[36]:


## Meta args
N_INSERTS = 500
INSERT_LOW = 250
INSERT_HIGH = 300


# In[37]:


## SE RAD data
DATA_R1 = os.path.join(DIR, "rad_example_R1_.fastq.gz")
OUTPUT_CHR = os.path.join(DIR, "rad_example_genome.fa")
big = RAD_to_genome(DATA_R1, 0, N_INSERTS, 
                    INSERT_LOW, INSERT_HIGH,
                    OUTPUT_CHR)


# In[38]:


## SE GBS data
DATA_R1 = os.path.join(DIR, "gbs_example_R1_.fastq.gz")
OUTPUT_CHR = os.path.join(DIR, "gbs_example_genome.fa")
RAD_to_genome(DATA_R1, 0, N_INSERTS,
              INSERT_LOW, INSERT_HIGH, 
              OUTPUT_CHR)


# In[39]:


## PAIR ddRAD data 
DATA_R1 = os.path.join(DIR, "pairddrad_example_R1_.fastq.gz")
DATA_R2 = os.path.join(DIR, "pairddrad_example_R2_.fastq.gz")
OUTPUT_CHR = os.path.join(DIR, "pairddrad_example_genome.fa")
RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, 
              INSERT_LOW, INSERT_HIGH,
              OUTPUT_CHR)


# In[40]:


## PAIR ddRAD data 
DATA_R1 = os.path.join(DIR, "pairddrad_wmerge_example_R1_.fastq.gz")
DATA_R2 = os.path.join(DIR, "pairddrad_wmerge_example_R2_.fastq.gz")
OUTPUT_CHR = os.path.join(DIR, "pairddrad_wmerge_example_genome.fa")
RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, 
              INSERT_LOW, INSERT_HIGH,  
              OUTPUT_CHR)


# In[41]:


## PAIR GBS data
DATA_R1 = os.path.join(DIR, "pairgbs_wmerge_example_R1_.fastq.gz")
DATA_R2 = os.path.join(DIR, "pairgbs_wmerge_example_R2_.fastq.gz")
OUTPUT_CHR = os.path.join(DIR, "pairgbs_wmerge_example_genome.fa")
RAD_to_genome(DATA_R1, DATA_R2, N_INSERTS, 
              INSERT_LOW, INSERT_HIGH, 
              OUTPUT_CHR)


# ## Run tests in ipyrad
# ### rad_example

# In[43]:


## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "tests1")
data1.set_params(2, os.path.join(DIR, 'rad_example_R1_.fastq.gz'))
data1.set_params(3, os.path.join(DIR, 'rad_example_barcodes.txt'))
data1.set_params("max_alleles_consens", 4) ## raise this for now 

## branch into an assembly for reference
data2 = data1.branch("reference")
data2.set_params(5, 'reference')
data2.set_params(6, os.path.join(DIR, 'rad_example_genome.fa'))

## branch into an assembly for denovo+reference
data3 = data2.branch("denovo-plus")
data3.set_params(5, 'denovo+reference')

## branch into an assembly for reference
data4 = data2.branch("denovo-minus")
data4.set_params(5, 'denovo-reference')

## assemble both
data1.run("1234567", force=True)
data2.run("1234567", force=True)
data3.run("1234567", force=True)
data4.run("1234567", force=True)


# In[45]:


NLOCI = 1000

## check results
assert all(data1.stats.clusters_hidepth == NLOCI)
assert data1.stats_dfs.s7_loci.sum_coverage.max() == NLOCI

assert all(data2.stats.clusters_hidepth == N_INSERTS)
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS

assert all(data3.stats.clusters_hidepth == NLOCI)
assert data3.stats_dfs.s7_loci.sum_coverage.max() == NLOCI

assert all(data4.stats.clusters_hidepth == NLOCI-N_INSERTS)
assert data4.stats_dfs.s7_loci.sum_coverage.max() == NLOCI-N_INSERTS


# ### gbs example

# In[38]:


## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "tests2")
data1.set_params(2, os.path.join(DIR, "gbs_example_R1_.fastq.gz"))
data1.set_params(3, os.path.join(DIR, "gbs_example_barcodes.txt"))
data1.set_params("max_alleles_consens", 4) ## raise this for now 

## branch into an assembly for reference
data2 = data1.branch("reference")
data2.set_params(5, "reference")
data2.set_params(6, os.path.join(DIR, "gbs_example_genome.fa"))

## branch into an assembly for denovo+reference
data3 = data2.branch("denovo-plus")
data3.set_params(5, 'denovo+reference')

## branch into an assembly for reference
data4 = data2.branch("denovo-minus")
data4.set_params(5, 'denovo-reference')

## assemble both
data1.run("1234567", force=True)
data2.run("1234567", force=True)
data3.run("1234567", force=True)
data4.run("1234567", force=True)


# In[39]:


## check results
assert all(data1.stats.clusters_hidepth == 1000)
assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000

assert all(data2.stats.clusters_hidepth == N_INSERTS)
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS

assert all(data3.stats.clusters_hidepth == 1000)
assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000

assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS)
assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS


# ### pairddrad without merged reads example

# In[52]:


## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "tests3")
data1.set_params(2, os.path.join(DIR, "pairddrad_example_*.fastq.gz"))
data1.set_params(3, os.path.join(DIR, "pairddrad_example_barcodes.txt"))
data1.set_params(7, "pairddrad")
data1.set_params(8, ("TGCAG", "CCGG"))

## branch into an assembly for reference
data2 = data1.branch("reference")
data2.set_params(5, "reference")
data2.set_params(6, os.path.join(DIR, "pairddrad_example_genome.fa"))

## branch into an assembly for denovo+reference
data3 = data2.branch("denovo-plus")
data3.set_params(5, 'denovo+reference')

## branch into an assembly for reference
data4 = data2.branch("denovo-minus")
data4.set_params(5, 'denovo-reference')

## assemble both
data1.run("1234567", force=True)
data2.run("1234567", force=True)
data3.run("1234567", force=True)
data4.run("1234567", force=True)


# In[54]:


## check results
assert all(data1.stats.clusters_hidepth == 1000)
assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000

assert all(data2.stats.clusters_hidepth == N_INSERTS)
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS

assert all(data3.stats.clusters_hidepth == 1000)
assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000

assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS)
assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS


# In[62]:


data2.stats_dfs


# ### pairgbs w/ merged reads example

# In[ ]:


## create an assembly for denovo
data1 = ip.Assembly("denovo")
data1.set_params(1, "tests4")
data1.set_params(2, os.path.join(DIR, "pairgbs_wmerge_example_*.fastq.gz"))
data1.set_params(3, os.path.join(DIR, "pairgbs_wmerge_example_barcodes.txt"))
data1.set_params(7, "pairgbs")
data1.set_params(8, ("TGCAG", "TGCAG"))

##...


## branch into an assembly for denovo+reference
data3 = data1.branch("denovo-plus")
data3.set_params(5, 'denovo+reference')

## branch into an assembly for reference
data4 = data1.branch("denovo-minus")
data4.set_params(5, 'denovo-reference')

## assemble both
data1.run("1234567", force=True)
data2.run("1234567", force=True)
data3.run("1234567", force=True)
data4.run("1234567", force=True)


# In[ ]:


## check results
assert all(data1.stats.clusters_hidepth == 1000)
assert data1.stats_dfs.s7_loci.sum_coverage.max() == 1000

assert all(data2.stats.clusters_hidepth == N_INSERTS)
assert data2.stats_dfs.s7_loci.sum_coverage.max() == N_INSERTS

assert all(data3.stats.clusters_hidepth == 1000)
assert data3.stats_dfs.s7_loci.sum_coverage.max() == 1000

assert all(data4.stats.clusters_hidepth == 1000-N_INSERTS)
assert data4.stats_dfs.s7_loci.sum_coverage.max() == 1000-N_INSERTS


# ## Create zipped archive of sim data

# In[63]:


get_ipython().run_cell_magic('bash', '-s $DIR', '## compressed dir/ w/ all data files\ntar -zcvf ipsimdata.tar.gz $1 \n')