#!/usr/bin/env python
# coding: utf-8

# # PE ddRAD data analysis with denovo and reference based assembly
# Data from "Single-nucleotide polymorphism discovery and panel characterization in the African forest elephant"
# 
# * http://onlinelibrary.wiley.com/doi/10.1002/ece3.3854/full?wol1URL=/doi/10.1002/ece3.3854/full&regionCode=US-NY&identityKey=34b23ec9-4666-4c37-8470-8448e64d6167
# 
# ## Other PE ddRAD datasets that could have been useful
# Ptychadena
# * http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0190440#sec022
# 
# Peromyscus (Munshi-South)
# 
# Chinese sea bass (SRP094869)
# * Population Genomics Reveals Genetic Divergence and Adaptive Differentiation of Chinese Sea Bass (Lateolabrax maculatus)
# * https://link.springer.com/article/10.1007/s10126-017-9786-0#Sec2

# In[29]:


import ipyrad as ip
import ipyrad.analysis as ipa
import ipyparallel as ipp
import pandas as pd
import toyplot
import glob
import gzip


## Ptychadena (Stephane)
## * http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0190440#sec022

## Peromyscus (Munshi-South)

## Chinese sea bass (SRP094869)
## Population Genomics Reveals Genetic Divergence and Adaptive Differentiation of Chinese Sea Bass (Lateolabrax maculatus)
## * https://link.springer.com/article/10.1007/s10126-017-9786-0#Sec2


# In[ ]:


get_ipython().run_cell_magic('bash', '', 'conda install -c eaton-lab toytree\nconda install -c bioconda sra-tools entrez-direct\n')


# In[ ]:


get_ipython().run_cell_magic('bash', '', '## Fetch the elephant reference genome\nmkdir ref\nwget ftp://ftp.broadinstitute.org/pub/assemblies/mammals/elephant/loxAfr3/assembly_supers.fasta.gz -o ref/assembly_supers.fasta.gz\n')


# In[ ]:


get_ipython().run_cell_magic('bash', '', '\n## Data from "Single-nucleotide polymorphism discovery and panel characterization in the African forest elephant"\n## http://onlinelibrary.wiley.com/doi/10.1002/ece3.3854/full?wol1URL=/doi/10.1002/ece3.3854/full&regionCode=US-NY&identityKey=34b23ec9-4666-4c37-8470-8448e64d6167\nipyrad --download SRP126637 raws/\n')


# In[39]:


## R1 and R2 were concatenated in the SRA data files so we have to pull them apart.
## This is probably not the most efficient way to do this, but it works.
raws = glob.glob("raws/*")
print(raws)
for r in raws:
    name = r.split("/")[1].split("_")[0]
    lines = gzip.open(r).readlines()
    nlines = len(lines)/2
    print("{} {}".format(name, nlines))
    with gzip.open("raws/{}_R1_.fastq.gz".format(name), 'wb') as r1:
        r1.write("".join(lines[:nlines]))
    with gzip.open("raws/{}_R2_.fastq.gz".format(name), 'wb') as r2:
        r2.write("".join(lines[nlines:]))


# ```
# Fetching project data...
#            Run    spots mates      ScientificName SampleName
# 0   SRR6371502   241196     0  Loxodonta cyclotis    LOC0127
# 1   SRR6371503  1133408     0  Loxodonta cyclotis    LOC0122
# 2   SRR6371505  1002779     0  Loxodonta cyclotis    LOC0151
# 3   SRR6371506  1582988     0  Loxodonta cyclotis    LOC0274
# 4   SRR6371507  2519228     0  Loxodonta cyclotis    LOC0263
# 5   SRR6371508  1002140     0  Loxodonta cyclotis    LOC0309
# 6   SRR6371509  2894868     0  Loxodonta cyclotis    LOC0279
# 7   SRR6371510  1190860     0  Loxodonta cyclotis    LOC0311
# 8   SRR6371511   267664     0  Loxodonta cyclotis    LOC0310
# 9   SRR6371512   758060     0  Loxodonta cyclotis    LOC0040
# 10  SRR6371513  2176494     0  Loxodonta cyclotis    LOC0038
# 11  SRR6371514  2319874     0  Loxodonta cyclotis    LOC0037
# 12  SRR6371515   906060     0  Loxodonta cyclotis    LOC0035
# 13  SRR6371516  1133648     0  Loxodonta cyclotis    LOC0051
# 14  SRR6371517  1816948     0  Loxodonta cyclotis    LOC0050
# 15  SRR6371518  1471242     0  Loxodonta cyclotis    LOC0049
# 16  SRR6371519  1366528     0  Loxodonta cyclotis    LOC0041
# 17  SRR6371520   225068     0  Loxodonta cyclotis    LOC0121
# 18  SRR6371521  1266382     0  Loxodonta cyclotis    LOC0088
# ```

# In[37]:


df = pd.DataFrame([x.split("\t") for x in """SRR6371521	SAMN08167496	LOC0088	WG011	178	95	SRX3466825	LOC0088_a	Lope
SRR6371520	SAMN08167497	LOC0121	WG012	31	16	SRX3466826	LOC0121_a	Minkebe
SRR6371519	SAMN08167492	LOC0041	WG005	192	102	SRX3466827	LOC0041_a	Waka
SRR6371518	SAMN08167493	LOC0049	WG008	207	110	SRX3466828	LOC0049_a	Ivindo
SRR6371517	SAMN08167494	LOC0050	WG009	256	135	SRX3466829	LOC0050_b	Ivindo
SRR6371516	SAMN08167495	LOC0051	WG010	160	85	SRX3466830	LOC0051_a	Ivindo
SRR6371515	SAMN08167488	LOC0035	WG001	127	69	SRX3466831	LOC0035_a	Minkebe
SRR6371514	SAMN08167489	LOC0037	WG002	327	175	SRX3466832	LOC0037_a	Lope
SRR6371513	SAMN08167490	LOC0038	WG003	307	164	SRX3466833	LOC0038_a	Lope
SRR6371512	SAMN08167491	LOC0040	WG004	106	56	SRX3466834	LOC0040_a	Wonga Wongue
SRR6371511	SAMN08167506	LOC0310	WG023	37	20	SRX3466835	LOC0310_a	Moukalaba Doudou
SRR6371510	SAMN08167507	LOC0311	WG024	168	89	SRX3466836	LOC0311_a	Monts de Cristal
SRR6371509	SAMN08167504	LOC0279	WG020	408	215	SRX3466837	LOC0279_b	South Mulundu
SRR6371508	SAMN08167505	LOC0309	WG022	141	75	SRX3466838	LOC0309_a	Mayumba
SRR6371507	SAMN08167502	LOC0263	WG018	355	188	SRX3466839	LOC0263_a	Wonga Wongue
SRR6371506	SAMN08167503	LOC0274	WG019	223	117	SRX3466840	LOC0274_a	Loango
SRR6371505	SAMN08167500	LOC0151	WG015	141	71	SRX3466841	LOC0151_a	Moukalaba Doudou
SRR6371503	SAMN08167498	LOC0122	WG013	159	85	SRX3466843	LOC0122_a	Minkebe
SRR6371502	SAMN08167499	LOC0127	WG014	34	18	SRX3466844	LOC0127_a	Moukalaba Doudou""".split("\n")])

## Just get the sequence identifier, the sample name, and the sample location
df = df.iloc[:, [0,7,8]]


# Launch the cluster by running this command on the command line:
# 
#     ipcluster start --n=19 --daemonize

# In[5]:


## connect to cluster and check if all the engines are running
ipyclient = ipp.Client()
ipyclient.ids


# In[19]:


data = ip.Assembly("Loxodonta")


# In[37]:


## set parameters
data.set_params("project_dir", "ddrad-denovo")
data.set_params("sorted_fastq_path", "raws/*_.fastq")
data.set_params("datatype", "pairddrad")
data.set_params("restriction_overhang", ("TGCAG", "CATGC"))
data.set_params("clust_threshold", "0.90")
data.set_params("filter_adapters", "2")
data.set_params("max_Hs_consens", (5, 5))
data.set_params("trim_loci", (0, 0, 0, 0))
data.set_params("output_formats", "*")

## see/print all parameters
data.get_params()
data.write_params(force=True)


# In[40]:


data.run("12")


# In[41]:


## access the stats of the assembly (so far) from the .stats attribute
data.stats


# In[42]:


## run steps 3-6 of the assembly
data.run("34567")


# In[7]:


kvalues = [2, 3, 4, 5, 6]
s = ipa.structure(
    name="quick",
    workdir="./analysis-structure",
    data="./ddrad-denovo/Loxodonta_outfiles/Loxodonta.ustr",
    )

## set main params (use much larger values in a real analysis)
s.mainparams.burnin = 1000
s.mainparams.numreps = 5000

## submit N replicates of each test to run on parallel client
for kpop in kvalues:
    s.run(kpop=kpop, nreps=4, ipyclient=ipyclient)

## wait for parallel jobs to finish
ipyclient.wait()


# In[8]:


## return the evanno table (deltaK) for best K 
etable = s.get_evanno_table(kvalues)
etable


# In[17]:


## set some clumpp params
s.clumppparams.m = 3               ## use largegreedy algorithm
s.clumppparams.greedy_option = 2   ## test nrepeat possible orders
s.clumppparams.repeats = 10000     ## number of repeats
s.clumppparams
## run clumpp for each value of K
#tables = s.get_clumpp_table(kvalues, quiet=True)
print(tables)
#table = tables[4].sort_values(by=[0, 1, 2, 3])
table = tables[2].sort_values(by=[0, 1])
toyplot.bars(
    table,
    width=500, 
    height=200,
    title=[[i] for i in table.index.tolist()],
    xshow=False,
);
print(table)