#!/usr/bin/env python
# coding: utf-8

# # COMP 204 Lecture 27
# # Handling Sequence with BioPython
# ## Yue Li
# #### based on materials from Mathieu Blanchette

# Edit directly on the notebook

# In[1]:


x=2
print(x)
print("Hello world")


# # BioPython
# 
# BioPython is a very popular third party Python package for handling biological data.
# 
# In order to install:
# 
# ```
# conda install biopython
# ```
# 
# BioPython has three major functionalities:
# 
# * Sequence Handling
# * 3D Structure
# * Population Genetics
# 
# We'll focus on sequence handling in this lecture.
# 
# Some useful references: [tutorial](http://biopython.org/DIST/docs/tutorial/Tutorial.html), [website](http://biopython.org/), [wiki](http://biopython.org/wiki/Category%3AWiki_Documentation) (most of my examples are coming from these sources).
#  
# # `BioPython.Seq`
# 
# The main object we'll be dealing with are sequences. This is handled by the `Seq` class.
# 
# As we know, we can have DNA, RNA, and Protein sequences.
# 
# BioPython helps us cleanly distinguish and do different things with different kinds of sequences.

# In[2]:


#let's make a generic sequence

from Bio.Seq import Seq # Seq is a class defined in Bio.Seq file

my_seq = Seq("CCCGGAGAGA")
print(type(my_seq))
#let's see what attributes this object has
print(my_seq)


# It seems like there is a lot we can do with this object.
# 
# The problem is at this point Python doesn't know what kind of sequence this is (DNA, RNA, Protein).
# 
# We have to specify what kind of "Alphabet" the sequence belongs to.

# In[3]:


#right now it has just a generic alphabet
print(my_seq.alphabet)


# In[4]:


from Bio.Alphabet import generic_dna, generic_protein, generic_rna

my_dna = Seq("CCCGGAGAG", generic_dna)
my_rna = Seq("ACCCGUUGU", generic_rna)
my_protein = Seq("AKKKGGGUUULL", generic_protein)

print(my_dna.alphabet)
print(my_rna.alphabet)
print(my_protein.alphabet)


# Biopython will now know the difference for example between a DNA base `A` for adenine and the protein residue `A` for alanine.
# 
# Now we can do some cool things.
# 
# We can perform the main actions of the central dogma: transcribe and translate.

# In[5]:


my_gene = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", generic_dna)

#get the mRNA
my_transcript = my_gene.transcribe()
print(my_transcript)

#get the protein from the mRNA
my_protein = my_transcript.translate()
print(my_protein)


# As you can see, we got some STOP codons represented as `*` and translation continued.
# 
# We can get translation to actually stop when it encounters a STOP codon.

# In[6]:


myprot = my_transcript.translate(to_stop=True)
print(myprot)


# ### General sequence methods
# 
# There are some convenient operations for dealing with sequences.

# We can concatenate sequences if they are of matching type.

# In[7]:


seq1 = Seq("AAACGGA", generic_dna)
seq2 = Seq("GGAGAT", generic_dna)

print(seq1 + seq2)


# We can also index and slice as though we had strings.

# In[8]:


print(seq1[:2] + seq2[-1])


# `Seq` objects are immutable, just like strings.

# In[21]:


seq1[2] = "G"


# There is another type of object called a `MutableSeq`. If we want to support mutability we can convert a `Seq` object to a `MutableSeq` object quite easily.

# In[22]:


mut_seq = seq1.tomutable()
mut_seq


# In[23]:


mut_seq[0] = "G"
print(mut_seq)


# We can also do searching inside sequences.

# In[24]:


myseq = Seq("CCAGAAACCCGGAA", generic_dna)

#find the first occurence of the pattern
print(myseq.find("GAA"))

#find the number of non-overlapping occurrences of a pattern
print(myseq.count("GAA"))


# ## Class Inheritance from Existing Class
# 
# We can write new classes that extend any existing class, including those defined in BioPython.
# 
# Example: Define the MySeq class that extends the Seq class to add:
# 
# 1. a list of confidence values (between 0 and 1) associated to each character in the sequence
# 2. an average_confidence() method that computes the average confidence values for the sequence
# 3. a gc_content() method that computes the fraction of bases that are either G or C

# In[89]:


from Bio.Seq import Seq

class MySeq(Seq):
    def __init__(self, seq, conf):
        Seq.__init__(self, seq)
        self.confidence = conf  # confidence values

    # Seq doesn't compute GC content so 
    # we'll add that functionality
    def gc_content(self):
        return len([b for b in self if b in "GC"]) / len(self)
    
    def avg_confidence(self):
        return sum(self.confidence)/len(self.confidence)

seq1 = Seq("ACGTATG")
seq2 = MySeq("AAACG",[0.9, 0.8, 0.5, 1, 0.8]) 
print("GC content = ",seq2.gc_content())
print("Average confidence value = ", seq2.avg_confidence())


# # Sequence alignment
# 
# Say you isolated some piece of DNA in the lab and obtained its sequence.
# 
# You now want to find out which organism that sequence belongs to.
# 
# You ask a biologist and they tell you to just "BLAST it". (recalled Lecture 14 miniBlast program)
# 
# As you may know already, this is essentially the same as "Google it" for biologists.
# 
# BLAST is an alignment algorithm that searches for your sequence of interest in a huge database of sequences whose origins are known.
# 
# If you didn't know BioPython, you would take your sequence `CTCTCCCAGCTGAGCTATAGCCCCAATGCGCACATAATAAATCGTGTGAACGGGGCGCATGATATGAGAC` 
# and go to the [NCBI BLAST web server](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome) and click on buttons like a monkey.
# 
# Thanks to BioPython we can do this programatically!
# 
# The `qblast` ('q' stands for query) method from the `Bio.Blast.NCBIWWW` module essentially sends our sequence to the BLAST web server.
# 
# Here we are using the "nucleotice BLAST" algorithm so we say `blastn` and we are using it on the database of all nucleotide sequences, called `nt`.
# 

# In[72]:


from Bio.Blast import NCBIWWW
s = Seq("CTCTCCCAGCTGAGCTATAGCCCCAATGCGCACATAATAAATCGTGTGAACGGGGCGCATGATATGAGAC", generic_dna)
result_handle = NCBIWWW.qblast("blastn", "nt", s)


# We wait a few seconds and get a `result_handle` which is like a temporary open file that we can read from.
# 
# The format of this file is in XML so not easy to read, thankfully BioPython has an XML parser that extracts all the information for us.

# In[73]:


from Bio.Blast import NCBIXML
blast_records = NCBIXML.parse(result_handle)


# We get an "iterator" of BLAST record objects, or "search results". We can now loop over each of our search results and print some information.
# 
# Here I am looping over all the results which each have an attribute `alignments` which are the alignments of our query sequence to some organism in the database.
# 
# The `alignment` attribute itself has other attributes like the `query` sequence, the `length` and `title` of the matching organism..

# In[74]:


for b in blast_records:
    for alignment in b.alignments:
        for hsp in alignment.hsps:
            print('****Alignment****')
            print('sequence:', alignment.title)
            print('length:', alignment.length)
            print('e value:', hsp.expect)
            print(hsp.query[0:75] + '...')
            print(hsp.match[0:75] + '...')
            print(hsp.sbjct[0:75] + '...')


# # `SeqRecord` and `SeqIO`
# 
# Often, sequences have some additional information associated to them. 
# 
# A good example was the BLAST exercise which gave us sequences associated to specific organisms, and genomic locations.
# 
# Ideally we would want to be able to keep this information along with our `Seq` if we have it.
# 
# BioPython lets us do this with the `SeqRecord` and `SeqIO` classes.
# 
# 
# ## Parsing with `SeqIO`
# 
# The `SeqIO` class which stands for Sequence Input/Output lets us read and write from various sequence annotation file formats which are common in biological databases.
# 
# I went to the [GenBank](https://www.ncbi.nlm.nih.gov/genbank/) sequence database and looked for some sequence related to the Bubonic Plague (Yersinia Pestis bacteria) [here](https://www.ncbi.nlm.nih.gov/nuccore/NZ_ADRZ01000932.1?report=fasta).
# 
# 
# 
# We get a `fasta` file which is very common for sequence annotations so BioPython can parse it for us automatically.
# 
# The annotation for a FASTA sequence is typically held in the header:
# 
# ```
# >NZ_ADRZ01000932.1 Yersinia pestis biovar Antiqua str. E1979001 Contig_E1979001_19275, whole genome shotgun sequence
# ```
# 
# The `SeqIO.parse` takes a path to a file and a format, in this case "fasta" and produces an iteratror over each entry in the fasta file.
# 
# Each item produced by the iterator is a `SeqRecords` object.
# 

# In[75]:


from Bio import SeqIO

record = SeqIO.read("plague.fa", "fasta")
print(record)


# The `SeqRecords` object has some useful attributes.
# 
# The object contains a `Seq` object witih the sequence info.

# In[76]:


print(record.seq)


# In[77]:


print(record)


# In[78]:


print(record.id)


# In[79]:


print(record.description)


# We can also create our own `SeqRecord` objects.

# In[80]:


from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
                       IUPAC.protein),
                   id="YP_025292.1", name="HokC",
                   description="toxic membrane protein, small")
print(record)


# Then we can write it to FASTA format.

# In[81]:


record.format("fasta")


# And then store it in a file. 
# 
# So now we know how to read sequence information, as well as produce our own sequence records and store them.

# In[82]:


f = open("myfasta.fa", "w")
f.write(record.format("fasta"))
f.close()    


# ## Making Multiple Sequence Alignments and Phylogenetic Trees
# 
# Another thing biologists like to do is align (essentially, compare) sequences.
# 
# We have seen multiple sequence alignment problem in our Assignment 2.
# 
# Basically, we first align any pair of sequences and evaluate how they differ from each other based on the similarity score (i.e., similarity score = match scores + gap penalties + mismatch panalties).
# 
# The similarity scores imply sequence phylogeny. 
# 
# We first compute a tree derived from the similarity between species in terms of their given sequences).
# 
# We then predict how evolution took place to explain a particular set of observed sequences.
# 
# 
# 
# ### Step 1: Obtain some sequences and align them
# 
# Use the fasta file p53.fa found here: https://www.cs.mcgill.ca/~yueli/teaching/COMP204_Winter2019/Lectures/27/p53.fa
# and use the ClustalW tool https://www.ebi.ac.uk/Tools/msa/clustalw2/
# to produce a multiple sequence alignment. The result is here: https://www.cs.mcgill.ca/~yueli/teaching/COMP204_Winter2019/Lectures/27/p53.aln

# First install ClustalW by the following commandline in the Anaconda Prompt
# 
# ```
# conda install -c bioconda clustalw
# ```

# 
# ### Step 2: Align the sequences

# In[83]:


from Bio.Align.Applications import ClustalwCommandline
in_file = "p53.fa"
out_file = "p53.aln"
cline = ClustalwCommandline("clustalw", infile=in_file, outfile=out_file)
stdout, stderr = cline()
print(stdout)


# Now that we have `SeqRecord` objects we can align them.
# 
# To do so, we need to put them in a `fasta` file.
# 
# ### Step 3: Load the resulting alignment

# In[84]:


#open the alignmnent file
from Bio import AlignIO
aln = open("p53.aln", "r") #use AlignIO to read the alignment file in 'clustal' format
alignment = AlignIO.read(aln, "clustal")

print(alignment)


# In[85]:


#write our sequences to fasta format
with open("seqs.fasta", "w") as f:
    SeqIO.write(seq_records, f, "fasta")


# We can parse the alignment with the `AlignIO` module from BioPython.
# 
# I won't go into detail on this module but it has a lot of useful funcionality for dealing with sequence alignments.
# 

# ### Step 4: Use the alignment to obtain a "distance" between all pairs of sequences
# 
# Using the parsed alignment, we can get the distance (or difference) between all the sequences.
# 
# This just tells us, for each pair of sequences, how different they are.

# In[86]:


from Bio.Phylo.TreeConstruction import DistanceCalculator

#calculate the distance matrix
calculator = DistanceCalculator('identity')
#adds distance matrix to the calculator object and returns it
dm = calculator.get_distance(alignment)
print(dm)


# ### Step 5: Construct a tree from the set of distances
# 
# And finally, we can construct a phylogenetic tree from the pairwise distances between all the sequences.

# In[87]:


from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

#initialize a DistanceTreeConstructor object based on our distance calculator object
constructor = DistanceTreeConstructor(calculator)

#build the tree
upgma_tree = constructor.build_tree(alignment)


# And let's use the `Phylo` module to visualize the result!

# In[88]:


from Bio import Phylo
import pylab
#draw the tree
Phylo.draw(upgma_tree)


# In[ ]: