#!/usr/bin/env python
# coding: utf-8

# In[1]:

# stdlib and third-party modules used directly in the debug cells below
import glob
import gzip
import os
import subprocess as sps
import time

import h5py
import numpy as np
from pysam import AlignmentFile

import ipyrad as ip
import ipyparallel as ipp


# In[2]:

from ipyrad.assemble.clustmap import *
from ipyrad.assemble.consens_se import *


# In[3]:

ipyclient = ipp.Client()


# In[4]:

data = ip.Assembly("test")
data.set_params("project_dir", "hotfix")
data.set_params("raw_fastq_path", "ipsimdata/pairddrad_example_R*_.fastq.gz")
data.set_params("barcodes_path", "ipsimdata/pairddrad_example_barcodes.txt")
data.set_params("reference_sequence", "ipsimdata/pairddrad_example_genome.fa")
data.set_params("assembly_method", "reference")
data.set_params("datatype", "pairddrad")
data.set_params("restriction_overhang", ("TGCAG", "CCG"))
data.get_params()


# In[5]:

data.run("12", force=True)


# In[7]:

data.run("3")


# In[8]:

data.run("4")


# In[9]:

data.run("5")


# In[11]:

data.run("6")


# In[51]:

from ipyrad.assemble.write_outputs import *

self = Step7(data, True, ipyclient)
self.split_clusters()

jobs = glob.glob(os.path.join(self.data.tmpdir, "chunk-*"))
jobs = sorted(jobs, key=lambda x: int(x.rsplit("-")[-1]))
for jobfile in jobs:
    args = (self.data, self.chunksize, jobfile)


# In[119]:

def get_edges(self, seqs):
    """
    Trim terminal edges or mask internal edges based on three criteria and
    take the max for each edge.
    1. user entered hard trimming.
    2. removing cutsite overhangs.
    3. trimming singleton-like overhangs from seqs of diff lengths.
    """
    # record whether to filter this locus based on sample coverage
    bad = False

    # 1. hard trim edges
    trim1 = np.array(self.data.paramsdict["trim_loci"])

    # 2. fuzzy match for trimming restriction site where it's expected.
    trim2 = np.array([0, 0, 0, 0])
    overhangs = np.array([
        i.encode() for i in self.data.paramsdict["restriction_overhang"]
    ])
    for pos, overhang in enumerate(overhangs):
        if overhang:
            cutter = np.array(list(overhang))
            trim2[pos] = check_cutter(seqs, pos, cutter, 0.75)

    # 3. find where the edge is not indel marked (really unknown ("N"))
    trim3 = np.array([0, 0, 0, 0])
    try:
        minsamp = min(4, seqs.shape[0])
        # minsamp = max(minsamp, self.data.paramsdict["min_samples_locus"])
        mincovs = np.sum((seqs != 78) & (seqs != 45), axis=0)
        for pos in range(4):
            trim3[pos] = check_minsamp(seqs, pos, minsamp, mincovs)
    except ValueError:
        print('error')
        bad = True

    # get max edges
    print(trim1, trim2, trim3)
    trim = np.max([trim1, trim2, trim3], axis=0)

    # return edges as slice indices
    r1left = trim[0]

    # single-end simple:
    if "pair" not in self.data.paramsdict["datatype"]:
        r1right = seqs.shape[1] - trim[1]
        r2left = r2right = r1right
        edges = (r1left, r1right, r2left, r2right)
    else:
        # paired data: a sketch under the assumption that R1 and R2 are
        # joined by a lowercase "nnnn" spacer (110 as uint8) shared by every
        # row; trim R1-right back from the spacer and R2-left/right relative
        # to the spacer and locus end.
        spacer = np.where(np.all(seqs == 110, axis=0))[0]
        r1right = spacer.min() - trim[1]
        r2left = spacer.max() + 1 + trim[2]
        r2right = seqs.shape[1] - trim[3]
        edges = (r1left, r1right, r2left, r2right)

    # get filter
    if (r1right < r1left) or (r2left < r1right) or (r2right < r2left):
        bad = True

    # if loc length is too short then filter
    if (r2right - r1left) < self.data.paramsdict["filter_min_trim_len"]:
        bad = True

    return bad, edges


# In[120]:

def check_minsamp(seq, position, minsamp, mincovs):
    "used in Processor.get_edges() for trimming edges of - or N sites."
    if position == 0:
        # find leftmost edge with minsamp coverage
        leftmost = np.where(mincovs >= minsamp)[0]
        if leftmost.size:
            return leftmost.min()
        # no sites actually have minsamp coverage although there are minsamp
        # rows of data, this can happen when reads only partially overlap. Loc
        # should be excluded for minsamp filter.
        else:
            raise ValueError("no sites above minsamp coverage in edge trim")

    elif position == 1:
        maxhit = np.where(mincovs >= minsamp)[0].max()
        return seq.shape[1] - (maxhit + 1)

    ## TODO...
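    # positions 2 and 3 would apply to the R2 edges of paired loci; for now
    # they are left untrimmed (return 0), per the TODO above.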
    elif position == 2:
        return 0

    else:
        return 0


# In[121]:

# store list of edge trims for VCF building
edgelist = []

# todo: this could be an iterator...
with open(self.chunkfile, 'rb') as infile:
    loci = infile.read().split(b"//\n//\n")

    # iterate over loci
    for iloc, loc in enumerate(loci):

        # load names and seqs
        lines = loc.decode().strip().split("\n")
        names = []
        nidxs = []
        aseqs = []
        useqs = []
        for line in lines:
            if line[0] == ">":
                name, nidx = line[1:].rsplit("_", 1)
                names.append(name)
                nidxs.append(nidx)
            else:
                aseqs.append(list(line))
                useqs.append(list(line.upper()))

        # filter to include only samples in this assembly
        mask = [i in self.data.snames for i in names]
        names = np.array(names)[mask].tolist()
        nidxs = np.array(nidxs)[mask].tolist()
        useqs = np.array(useqs)[mask, :].astype(bytes).view(np.uint8)
        aseqs = np.array(aseqs)[mask, :].astype(bytes).view(np.uint8)

        # apply filters
        efilter, edges = get_edges(self, useqs)
        print(efilter, edges)


# In[54]:

data = self.data
chunksize = self.chunksize
chunkfile = jobs[0]
self = Processor(data, chunksize, chunkfile)


# In[23]:

self.remote_process_chunks()
self.collect_stats()
self.store_file_handles()


# In[24]:

start = time.time()
printstr = ("building arrays ", "s7")
rasyncs = {}

args0 = (self.data,)
args1 = (self.data, self.ntaxa, self.nbases, self.nloci)
args2 = (self.data, self.ntaxa, self.nsnps)

write_loci_and_alleles(*args0)


# In[25]:

fill_seq_array(*args1)


# In[49]:

data = self.data
ntaxa = self.ntaxa
nsnps = self.nsnps

# get faidict to convert chroms to ints
if data.isref:
    faidict = chroms2ints(data, True)

# open new database file handle
with h5py.File(data.snps_database, 'w') as io5:

    # Database files for storing arrays on disk.
    # Should optimize for slicing by rows if we run into slow writing, or if
    # it uses too much mem. For now let h5py do auto-chunking.
    io5.create_dataset(
        name="snps",
        shape=(ntaxa, nsnps),
        dtype=np.uint8,
    )
    # store snp locations:
    # (loc-counter, loc-snp-counter, loc-snp-pos, chrom, chrom-snp-pos)
    io5.create_dataset(
        name="snpsmap",
        shape=(nsnps, 5),
        dtype=np.uint32,
    )
    # store pseudo-reference alleles (up to 4 observed bases per SNP)
    io5.create_dataset(
        name="pseudoref",
        shape=(nsnps, 4),
        dtype=np.uint8,
    )
    # store genotype calls (0/0, 0/1, 0/2, etc.)
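    # genos holds two allele calls per sample per SNP (the trailing axis of
    # length 2), matching the 0/0, 0/1, ... notation above.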
    io5.create_dataset(
        name="genos",
        shape=(nsnps, ntaxa, 2),
        dtype=np.uint8,
    )

    # gather all loci bits
    locibits = glob.glob(os.path.join(data.tmpdir, "*.loci"))
    sortbits = sorted(
        locibits,
        key=lambda x: int(x.rsplit("-", 1)[-1][:-5]))

    # name order for entry in array
    sidxs = {sample: i for (i, sample) in enumerate(data.snames)}

    # iterate through file
    start = end = 0
    tmploc = {}
    locidx = 1
    snpidx = 1

    # array to store until writing
    tmparr = np.zeros((ntaxa, nsnps), dtype=np.uint8)
    tmpmap = np.zeros((nsnps, 5), dtype=np.uint32)

    # iterate over chunkfiles
    for bit in sortbits:
        # iterate lines of file until locus endings
        for line in iter(open(bit, 'r')):

            # still filling locus until |\n
            if "|\n" not in line:
                name, seq = line.split()
                tmploc[name] = seq

            # locus is full, dump it
            else:
                # convert seqs to an array
                loc = (
                    np.array([list(i) for i in tmploc.values()])
                    .astype(bytes).view(np.uint8)
                )
                snps, idxs, _ = line[len(data.snppad):].rsplit("|", 2)
                snpsmask = np.array(list(snps)) != " "
                snpsidx = np.where(snpsmask)[0]

                # select only the SNP sites
                snpsites = loc[:, snpsmask]

                # store end position of locus for map
                end = start + snpsites.shape[1]
                print(start, end)
                for idx, name in enumerate(tmploc):
                    tmparr[sidxs[name], start:end] = snpsites[idx, :]

                # store snpsmap data 1-indexed with chroms info
                if data.isref:
                    chrom, pos = idxs.split(",")[0].split(":")
                    start = int(pos.split("-")[0])
                    #chromidx = faidict[chrom]
                    chromidx = int(chrom)
                    for isnp in range(snpsites.shape[1]):
                        isnpx = snpsidx[isnp]
                        tmpmap[snpidx - 1] = (
                            locidx, isnp, isnpx, chromidx, isnpx + start,
                        )
                        snpidx += 1

                # store snpsmap data (snpidx is 1-indexed)
                else:
                    for isnp in range(snpsites.shape[1]):
                        tmpmap[snpidx - 1] = (
                            locidx, isnp, snpsidx[isnp], 0, snpidx,
                        )
                        snpidx += 1
                locidx += 1

                # reset locus
                start = end
                tmploc = {}

    # fill missing with 78 (N)
    tmparr[tmparr == 0] = 78


# In[48]:

print(tmparr.shape)
print(start, end)
tmparr[sidxs[name], start:end]


# In[21]:

fill_snp_array(*args2)


# In[12]:

data.run("7")


# In[ ]:


# In[ ]:


# In[5]:

data.run("3")


# In[6]:

data.run("456")


# In[11]:

data.run("1", force=True)


# In[9]:

data = ip.load_json("tortas/5-tortas.json")


# In[18]:

d2 = data.branch("d2", subsamples=["AGO09concat"])


# In[19]:

d2.run("3", force=True)


# In[20]:

d2.run("45", force=True)


# In[7]:

d2 = ip.load_json("./tortas/d2.json")


# In[9]:

d2.stats


# In[11]:

self = Step5(data, True, ipyclient)


# In[12]:

self.remote_calculate_depths()


# In[13]:

self.remote_make_chunks()


# In[14]:

statsdicts = self.remote_process_chunks()


# In[15]:

statsdicts


# In[ ]:

self.remote_concatenate_chunks()


# In[ ]:

self.data_store(statsdicts)


# In[10]:

data.stats


# In[5]:

dd = data.branch("dd")
dd.run("5", force=True)


# In[6]:

dd.run("3", force=True)


# In[4]:

data.run("5", force=True)


# In[5]:

step = Step6(data, True, ipyclient)


# In[6]:

step.remote_concat_bams()


# In[7]:

step.remote_build_ref_regions()


# In[8]:

self = step


# In[9]:

regs = self.regions[:20]
regs


# In[11]:

# access reads from bam file using pysam
bamfile = AlignmentFile(
    os.path.join(
        self.data.dirs.across,
        "cat.sorted.bam"),
    'rb')

# catcons output file for raw clusters and db samples
outfile = gzip.open(
    os.path.join(
        self.data.dirs.across,
        "{}.catcons.gz".format(self.data.name)),
    'wb')

# write header line to catcons with sample names
snames = sorted([i.name for i in self.samples])
nsamples = len(snames)
outfile.write(
    b" ".join([i.encode() for i in snames]) + b"\n")

# get clusters
lidx = 0
clusts = []
# while 1:
#     try:
#         region = next(self.regions)
#         reads = bamfile.fetch(*region)
#     except StopIteration:
#         break
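# NOTE: the commented-out while-loop above would consume every region from
# the generator; the for-loop below runs the same steps on only the
# 20-region debug subset stored in regs.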
for region in regs:
    reads = bamfile.fetch(*region)

    # get reference
    print(region)
    refn, refs = get_ref_region(
        data.paramsdict["reference_sequence"],
        region[0], region[1] + 1, region[2] + 1)

    # build cluster dict for sorting
    rdict = {}
    for read in reads:
        rdict[read.qname] = read.seq
    keys = sorted(rdict.keys(), key=lambda x: x.rsplit(":", 2)[0])

    # build cluster based on map positions (reads are already oriented)
    arr = np.zeros((nsamples + 1, len(refs)), dtype=bytes)

    # fill it
    arr[0] = list(refs)
    for idx, key in enumerate(keys):

        # get start and stop from this seq
        sname = key.rsplit("_", 1)[0]
        rstart, rstop = key.rsplit(":", 2)[-1].split("-")
        sidx = snames.index(sname)

        # get start relative to ref
        start = int(rstart) - int(region[1]) - 1
        stop = start + int(rstop) - int(rstart)
        print(sidx + 1, start, stop, arr.shape[1])
        arr[sidx + 1, int(start): int(stop)] = list(rdict[key])
    print("")

    # write this cluster to the catcons file, N-filling empty cells
    arr[arr == b""] = b"N"
    for line in arr:
        outfile.write(line.tostring() + b"\n")
    outfile.write(b"\n")

outfile.close()


# In[50]:

print(start, stop, stop - start, len(rdict[key]),
      rstart, rstop, int(rstop) - int(rstart))
print(region, region[2] - region[1], len(refs))


# In[61]:

# inspect the fields of the read name (sample name, chrom:start-end suffix)
key.rsplit(":", 2)


# In[62]:

snames


# In[141]:

revcomp("AATTCCATTCTTCCTTTCCCATACCTCCCGCCCTGCTCCTTTCCKCTCTTGATTTCTTCTTGAGGGAGGCAGAGGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGGCACCAGATTTTCTCACTGTTCAGGTCAGGGTTTGACTTCAGCCCCATCTCTAATACAAGCCATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN")


# In[37]:

# get consens seq and variant site index
clust = []
avars = refvars(arr.view(np.uint8), PSEUDO_REF)
dat = b"".join(avars.view("S1")[:, 0]).decode()
snpstring = "".join(["*" if i else " " for i in avars[:, 1]])
clust.append("ref_{}:{}-{}\n{}".format(*region, dat))

# or, just build variant string (or don't...)
# write all loci with [locids, nsnps, npis, nindels, ?]
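# each cluster built below is appended to clusts and written out in blocks
# delimited by "//\n//\n" pairs, the same locus separator that the Step7
# chunkfile reader above splits on.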
for key in keys:
    clust.append("{}\n{}".format(key, rdict[key]))
clust.append("SNPs\n" + snpstring)
clusts.append("\n".join(clust))

# advance locus counter
lidx += 1

# write chunks to file
if not lidx % 1000:
    outfile.write(
        str.encode("\n//\n//\n".join(clusts) + "\n//\n//\n"))
    clusts = []

# write remaining
if clusts:
    outfile.write(
        str.encode("\n//\n//\n".join(clusts) + "\n//\n//\n"))
outfile.close()


# In[16]:

step.remote_build_ref_clusters()


# In[4]:

from ipyrad.assemble.clustmap import *


# In[6]:

self = Step3(data, 8, True, ipyclient)


# In[7]:

self.run()


# In[8]:

self.data.run("45")


# In[13]:

self.remote_index_refs()
self.remote_run(
    function=concat_multiple_edits,
    printstr=("concatenating ", "s3"),
    args=(),
)
self.remote_run(
    function=merge_end_to_end,
    printstr=("join unmerged pairs ", "s3"),
    args=(False, False,),
)


# In[15]:

self.remote_run(
    function=dereplicate,
    printstr=("dereplicating ", "s3"),
    args=(self.nthreads,),
    threaded=True,
)


# In[16]:

self.remote_run(
    function=split_endtoend_reads,
    printstr=("splitting dereps ", "s3"),
    args=(),
)


# In[ ]:

self.remote_run(
    function=mapping_reads,
    printstr=("mapping reads ", "s3"),
    args=(self.nthreads,),
    threaded=True,
)


# In[ ]:

sample = list(self.data.samples.values())[0]
merge_end_to_end(self.data, sample, True, True)


# In[ ]:


# In[15]:

sample = list(data.samples.values())[0]
infiles = [
    os.path.join(data.dirs.edits, "{}.trimmed_R1_.fastq.gz".format(sample.name)),
    os.path.join(data.dirs.edits, "{}_R1_concatedit.fq.gz".format(sample.name)),
    os.path.join(data.tmpdir, "{}_merged.fastq".format(sample.name)),
    os.path.join(data.tmpdir, "{}_declone.fastq".format(sample.name)),
]
infiles = [i for i in infiles if os.path.exists(i)]
infile = infiles[-1]
infile


# In[19]:

strand = "plus"
if data.paramsdict["datatype"] in ("gbs", "2brad"):
    strand = "both"

nthreads = 2
cmd = [
    ip.bins.vsearch,
    "--derep_fulllength", infile,
    "--strand", strand,
    "--output", os.path.join(data.tmpdir, sample.name + "_derep.fastq"),
    "--threads", str(nthreads),
    "--fasta_width", str(0),
    "--fastq_qmax", "1000",
    "--sizeout",
    "--relabel_md5",
]
cmd


# In[20]:

proc = sps.Popen(cmd, stderr=sps.STDOUT, stdout=sps.PIPE, close_fds=True)
errmsg = proc.communicate()[0]
if proc.returncode:
    ip.logger.error("error inside dereplicate %s", errmsg)
    raise IPyradWarningExit(errmsg)


# In[10]:

self.remote_run(
    function=dereplicate,
    printstr=("dereplicating ", "s3"),
    args=(self.nthreads,),
    threaded=True,
)
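

# In[ ]:

# A minimal sketch (not part of the assembly run above) of the min-sample
# edge-trim logic used by get_edges()/check_minsamp(): keep only the columns
# in which at least `minsamp` rows hold real bases (not N=78 or dash=45).
# The toy array below is hypothetical.
import numpy as np

toy = np.array([
    list(b"NNAGCTAGNN"),
    list(b"NAAGCTAGCN"),
    list(b"NNAGCTAGCN"),
], dtype=np.uint8)

minsamp = min(4, toy.shape[0])                       # 3 for this toy locus
mincovs = np.sum((toy != 78) & (toy != 45), axis=0)  # per-column coverage
covered = np.where(mincovs >= minsamp)[0]            # columns passing minsamp
left_trim = covered.min()                            # 2
right_trim = toy.shape[1] - (covered.max() + 1)      # 2
print(left_trim, right_trim)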