#!/usr/bin/env python
# coding: utf-8

# The problem: You've just completed an amplicon sequencing run on the 454 instrument, but because the sequences are longer than any you've previously generated on 454, you suspect that you may have sequenced through your reverse primers and into non-biological sequence (e.g., sequencing adapters).
# 
# The solution: You want to find your reverse PCR primer in each of the sequences, and remove that and all bases following it. To do this, you can use scikit-bio's [global nucleotide aligner](http://scikit-bio.org/docs/0.5.0/generated/skbio.alignment.global_pairwise_align_nucleotide.html). 

# In[1]:


import numpy as np
import random

from skbio.alignment import global_pairwise_align_nucleotide
from skbio import DNA


# First, we'll model some sequences. This is quick-and-dirty. Each sequence will contain some biological sequence (which is what we actually care about) with a mean/std length of 400/40, followed by one of four slightly different reverse primers (so representing a primer with 4-fold degeneracy), followed by some non-biological sequence with a mean/std length of 25/2. This is a reasonable representation of what we'd get off of the sequencing instrument: our reverse primer is somewhere in the sequence, but we don't know the exact start or end positions. *(Note that I'm not modeling any sort of sequencing error here, and the biological sequence is random, which is not representative of what we'd have in an amplicon sequencing run.)*
# 
# Follow the inline comments for descriptions of each step.

# In[2]:


sequences = []
num_sequences = 50
mean_biological_sequence_length = 400
std_biological_sequence_length = 40
mean_nonbiological_sequence_length = 25
std_nonbiological_sequence_length = 2
# imagine that we have four slightly different reverse primers
reverse_primers = [DNA("ACCGTCGACCGTTAGGATA"),
                   DNA("ACCGTGGACCGTGAGGATT"),
                   DNA("ACCGTCGACCGTTAGGATT"),
                   DNA("ACCGTGGACCGTGAGGATG")]

for i in range(num_sequences):
    # determine the length for the current biological sequence. if it's less than 1, make the length 0
    biological_sequence_length = int(np.random.normal(mean_biological_sequence_length,
                                                      std_biological_sequence_length))
    if biological_sequence_length < 1:
        biological_sequence_length = 0
    # generate a random sequence of that length
    biological_sequence = ''.join(np.random.choice(list('ACGT'),biological_sequence_length))
    
    
    # determine the length for the current non-biological sequence. if it's less than 1, make the length 0
    non_biological_sequence_length = int(np.random.normal(mean_nonbiological_sequence_length,
                                                          std_nonbiological_sequence_length))
    if non_biological_sequence_length < 1:
        non_biological_sequence_length = 0
    # generate a random sequence of that length
    non_biological_sequence = ''.join(np.random.choice(list('ACGT'), non_biological_sequence_length))

    # choose one of the four reverse primers at random
    reverse_primer = random.choice(reverse_primers)
    
    # construct the observed sequence as the biological sequence, followed by the primer, followed by the 
    # non-biological sequence
    observed_sequence = ''.join(map(str, [biological_sequence, reverse_primer, non_biological_sequence]))
    seq_id = "seq%d" % i
    
    # append the result to the sequences list
    sequences.append(DNA(observed_sequence, metadata={'id': seq_id}))


# In[3]:


print(repr(sequences[0]))


# Now to get to the problem at hand. How do we find the primer sequence in random sequence. The answer is with global alignment. If we align the first reverse primer to the first sequence, we can get a [``TabularMSA``](http://scikit-bio.org/docs/0.5.0/generated/skbio.alignment.TabularMSA) object back. 
# 
# Notice that in this step we get an ``EfficencyWarning``. That's because scikit-bio currently only has a python implementation of global alignment, which is slow because it's a computationally complex algorithm. In the future, we'll have a C-based implementation which will be much faster.

# In[4]:


aln = global_pairwise_align_nucleotide(reverse_primers[0], sequences[0])[0]


# We next want to find the start position of the primer sequence in the sequencing product, which we can do using the ``gaps`` boolean vector of the first sequence in the alignment (to learn about ``gap_vector``). The following tells us where the first non-gap character in the primer alignment is, which is the position in the sequencing product where the primer match begins.

# In[5]:


gap_vector = aln[0].gaps()
primer_start_index = (~gap_vector).nonzero()[0][0]
print(primer_start_index)


# So, we can slice the original sequence through that position, and the result will be our sequencing product minus the reverse primer and the non-biological sequence.

# In[6]:


print(sequences[0][:primer_start_index])


# Finally, if we want to do this for all of the sequences, we can embed the above steps in a loop over the ``DNA`` sequences. 

# In[7]:


trimmed_sequences = []

for sequence in sequences:
    aln = global_pairwise_align_nucleotide(reverse_primers[0], sequence)[0]
    gap_vector = aln[0].gaps()
    primer_start_index = (~gap_vector).nonzero()[0][0]
    trimmed_sequences.append(DNA(sequence[:primer_start_index], metadata={'id': sequence.metadata['id']}))


# We can then print the result, and we'll have acheived our goal.

# In[8]:


import skbio

print("".join(skbio.io.write((s for s in trimmed_sequences), into=[], format='fasta')))