#!/usr/bin/env python # coding: utf-8 # # COMP 204 Lecture 27 # # Handling Sequence with BioPython # ## Yue Li # #### based on materials from Mathieu Blanchette # Edit directly on the notebook # In[1]: x=2 print(x) print("Hello world") # # BioPython # # BioPython is a very popular third party Python package for handling biological data. # # In order to install: # # ``` # conda install biopython # ``` # # BioPython has three major functionalities: # # * Sequence Handling # * 3D Structure # * Population Genetics # # We'll focus on sequence handling in this lecture. # # Some useful references: [tutorial](http://biopython.org/DIST/docs/tutorial/Tutorial.html), [website](http://biopython.org/), [wiki](http://biopython.org/wiki/Category%3AWiki_Documentation) (most of my examples are coming from these sources). # # # `BioPython.Seq` # # The main object we'll be dealing with are sequences. This is handled by the `Seq` class. # # As we know, we can have DNA, RNA, and Protein sequences. # # BioPython helps us cleanly distinguish and do different things with different kinds of sequences. # In[2]: #let's make a generic sequence from Bio.Seq import Seq # Seq is a class defined in Bio.Seq file my_seq = Seq("CCCGGAGAGA") print(type(my_seq)) #let's see what attributes this object has print(my_seq) # It seems like there is a lot we can do with this object. # # The problem is at this point Python doesn't know what kind of sequence this is (DNA, RNA, Protein). # # We have to specify what kind of "Alphabet" the sequence belongs to. # In[3]: #right now it has just a generic alphabet print(my_seq.alphabet) # In[4]: from Bio.Alphabet import generic_dna, generic_protein, generic_rna my_dna = Seq("CCCGGAGAG", generic_dna) my_rna = Seq("ACCCGUUGU", generic_rna) my_protein = Seq("AKKKGGGUUULL", generic_protein) print(my_dna.alphabet) print(my_rna.alphabet) print(my_protein.alphabet) # Biopython will now know the difference for example between a DNA base `A` for adenine and the protein residue `A` for alanine. # # Now we can do some cool things. # # We can perform the main actions of the central dogma: transcribe and translate. # In[5]: my_gene = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", generic_dna) #get the mRNA my_transcript = my_gene.transcribe() print(my_transcript) #get the protein from the mRNA my_protein = my_transcript.translate() print(my_protein) # As you can see, we got some STOP codons represented as `*` and translation continued. # # We can get translation to actually stop when it encounters a STOP codon. # In[6]: myprot = my_transcript.translate(to_stop=True) print(myprot) # ### General sequence methods # # There are some convenient operations for dealing with sequences. # We can concatenate sequences if they are of matching type. # In[7]: seq1 = Seq("AAACGGA", generic_dna) seq2 = Seq("GGAGAT", generic_dna) print(seq1 + seq2) # We can also index and slice as though we had strings. # In[8]: print(seq1[:2] + seq2[-1]) # `Seq` objects are immutable, just like strings. # In[21]: seq1[2] = "G" # There is another type of object called a `MutableSeq`. If we want to support mutability we can convert a `Seq` object to a `MutableSeq` object quite easily. # In[22]: mut_seq = seq1.tomutable() mut_seq # In[23]: mut_seq[0] = "G" print(mut_seq) # We can also do searching inside sequences. # In[24]: myseq = Seq("CCAGAAACCCGGAA", generic_dna) #find the first occurence of the pattern print(myseq.find("GAA")) #find the number of non-overlapping occurrences of a pattern print(myseq.count("GAA")) # ## Class Inheritance from Existing Class # # We can write new classes that extend any existing class, including those defined in BioPython. # # Example: Define the MySeq class that extends the Seq class to add: # # 1. a list of confidence values (between 0 and 1) associated to each character in the sequence # 2. an average_confidence() method that computes the average confidence values for the sequence # 3. a gc_content() method that computes the fraction of bases that are either G or C # In[89]: from Bio.Seq import Seq class MySeq(Seq): def __init__(self, seq, conf): Seq.__init__(self, seq) self.confidence = conf # confidence values # Seq doesn't compute GC content so # we'll add that functionality def gc_content(self): return len([b for b in self if b in "GC"]) / len(self) def avg_confidence(self): return sum(self.confidence)/len(self.confidence) seq1 = Seq("ACGTATG") seq2 = MySeq("AAACG",[0.9, 0.8, 0.5, 1, 0.8]) print("GC content = ",seq2.gc_content()) print("Average confidence value = ", seq2.avg_confidence()) # # Sequence alignment # # Say you isolated some piece of DNA in the lab and obtained its sequence. # # You now want to find out which organism that sequence belongs to. # # You ask a biologist and they tell you to just "BLAST it". (recalled Lecture 14 miniBlast program) # # As you may know already, this is essentially the same as "Google it" for biologists. # # BLAST is an alignment algorithm that searches for your sequence of interest in a huge database of sequences whose origins are known. # # If you didn't know BioPython, you would take your sequence `CTCTCCCAGCTGAGCTATAGCCCCAATGCGCACATAATAAATCGTGTGAACGGGGCGCATGATATGAGAC` # and go to the [NCBI BLAST web server](https://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastn&PAGE_TYPE=BlastSearch&LINK_LOC=blasthome) and click on buttons like a monkey. # # Thanks to BioPython we can do this programatically! # # The `qblast` ('q' stands for query) method from the `Bio.Blast.NCBIWWW` module essentially sends our sequence to the BLAST web server. # # Here we are using the "nucleotice BLAST" algorithm so we say `blastn` and we are using it on the database of all nucleotide sequences, called `nt`. # # In[72]: from Bio.Blast import NCBIWWW s = Seq("CTCTCCCAGCTGAGCTATAGCCCCAATGCGCACATAATAAATCGTGTGAACGGGGCGCATGATATGAGAC", generic_dna) result_handle = NCBIWWW.qblast("blastn", "nt", s) # We wait a few seconds and get a `result_handle` which is like a temporary open file that we can read from. # # The format of this file is in XML so not easy to read, thankfully BioPython has an XML parser that extracts all the information for us. # In[73]: from Bio.Blast import NCBIXML blast_records = NCBIXML.parse(result_handle) # We get an "iterator" of BLAST record objects, or "search results". We can now loop over each of our search results and print some information. # # Here I am looping over all the results which each have an attribute `alignments` which are the alignments of our query sequence to some organism in the database. # # The `alignment` attribute itself has other attributes like the `query` sequence, the `length` and `title` of the matching organism.. # In[74]: for b in blast_records: for alignment in b.alignments: for hsp in alignment.hsps: print('****Alignment****') print('sequence:', alignment.title) print('length:', alignment.length) print('e value:', hsp.expect) print(hsp.query[0:75] + '...') print(hsp.match[0:75] + '...') print(hsp.sbjct[0:75] + '...') # # `SeqRecord` and `SeqIO` # # Often, sequences have some additional information associated to them. # # A good example was the BLAST exercise which gave us sequences associated to specific organisms, and genomic locations. # # Ideally we would want to be able to keep this information along with our `Seq` if we have it. # # BioPython lets us do this with the `SeqRecord` and `SeqIO` classes. # # # ## Parsing with `SeqIO` # # The `SeqIO` class which stands for Sequence Input/Output lets us read and write from various sequence annotation file formats which are common in biological databases. # # I went to the [GenBank](https://www.ncbi.nlm.nih.gov/genbank/) sequence database and looked for some sequence related to the Bubonic Plague (Yersinia Pestis bacteria) [here](https://www.ncbi.nlm.nih.gov/nuccore/NZ_ADRZ01000932.1?report=fasta). # # # # We get a `fasta` file which is very common for sequence annotations so BioPython can parse it for us automatically. # # The annotation for a FASTA sequence is typically held in the header: # # ``` # >NZ_ADRZ01000932.1 Yersinia pestis biovar Antiqua str. E1979001 Contig_E1979001_19275, whole genome shotgun sequence # ``` # # The `SeqIO.parse` takes a path to a file and a format, in this case "fasta" and produces an iteratror over each entry in the fasta file. # # Each item produced by the iterator is a `SeqRecords` object. # # In[75]: from Bio import SeqIO record = SeqIO.read("plague.fa", "fasta") print(record) # The `SeqRecords` object has some useful attributes. # # The object contains a `Seq` object witih the sequence info. # In[76]: print(record.seq) # In[77]: print(record) # In[78]: print(record.id) # In[79]: print(record.description) # We can also create our own `SeqRecord` objects. # In[80]: from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord from Bio.Alphabet import IUPAC record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", IUPAC.protein), id="YP_025292.1", name="HokC", description="toxic membrane protein, small") print(record) # Then we can write it to FASTA format. # In[81]: record.format("fasta") # And then store it in a file. # # So now we know how to read sequence information, as well as produce our own sequence records and store them. # In[82]: f = open("myfasta.fa", "w") f.write(record.format("fasta")) f.close() # ## Making Multiple Sequence Alignments and Phylogenetic Trees # # Another thing biologists like to do is align (essentially, compare) sequences. # # We have seen multiple sequence alignment problem in our Assignment 2. # # Basically, we first align any pair of sequences and evaluate how they differ from each other based on the similarity score (i.e., similarity score = match scores + gap penalties + mismatch panalties). # # The similarity scores imply sequence phylogeny. # # We first compute a tree derived from the similarity between species in terms of their given sequences). # # We then predict how evolution took place to explain a particular set of observed sequences. # # # # ### Step 1: Obtain some sequences and align them # # Use the fasta file p53.fa found here: https://www.cs.mcgill.ca/~yueli/teaching/COMP204_Winter2019/Lectures/27/p53.fa # and use the ClustalW tool https://www.ebi.ac.uk/Tools/msa/clustalw2/ # to produce a multiple sequence alignment. The result is here: https://www.cs.mcgill.ca/~yueli/teaching/COMP204_Winter2019/Lectures/27/p53.aln # First install ClustalW by the following commandline in the Anaconda Prompt # # ``` # conda install -c bioconda clustalw # ``` # # ### Step 2: Align the sequences # In[83]: from Bio.Align.Applications import ClustalwCommandline in_file = "p53.fa" out_file = "p53.aln" cline = ClustalwCommandline("clustalw", infile=in_file, outfile=out_file) stdout, stderr = cline() print(stdout) # Now that we have `SeqRecord` objects we can align them. # # To do so, we need to put them in a `fasta` file. # # ### Step 3: Load the resulting alignment # In[84]: #open the alignmnent file from Bio import AlignIO aln = open("p53.aln", "r") #use AlignIO to read the alignment file in 'clustal' format alignment = AlignIO.read(aln, "clustal") print(alignment) # In[85]: #write our sequences to fasta format with open("seqs.fasta", "w") as f: SeqIO.write(seq_records, f, "fasta") # We can parse the alignment with the `AlignIO` module from BioPython. # # I won't go into detail on this module but it has a lot of useful funcionality for dealing with sequence alignments. # # ### Step 4: Use the alignment to obtain a "distance" between all pairs of sequences # # Using the parsed alignment, we can get the distance (or difference) between all the sequences. # # This just tells us, for each pair of sequences, how different they are. # In[86]: from Bio.Phylo.TreeConstruction import DistanceCalculator #calculate the distance matrix calculator = DistanceCalculator('identity') #adds distance matrix to the calculator object and returns it dm = calculator.get_distance(alignment) print(dm) # ### Step 5: Construct a tree from the set of distances # # And finally, we can construct a phylogenetic tree from the pairwise distances between all the sequences. # In[87]: from Bio.Phylo.TreeConstruction import DistanceTreeConstructor #initialize a DistanceTreeConstructor object based on our distance calculator object constructor = DistanceTreeConstructor(calculator) #build the tree upgma_tree = constructor.build_tree(alignment) # And let's use the `Phylo` module to visualize the result! # In[88]: from Bio import Phylo import pylab #draw the tree Phylo.draw(upgma_tree) # In[ ]: