#!/usr/bin/env python # coding: utf-8 # # Genome Assembly for Hackers # # *This notebook first appeared as a [blog post](//betatim.github.io/posts/genome-hackers) on [Tim Head](//betatim.github.io)'s blog.* # # *License: [MIT](http://opensource.org/licenses/MIT)* # # *(C) 2016, Tim Head.* # *Feel free to use, distribute, and modify with the above attribution.* # This post collects code snippets I created while learning about genome assembly. # I understand complicated things through code. I build simulations or little # tools and they help me immensely to understand how something works. Probably # because it uses vocabulary I am familiar with (python!) instead of domain # specific jargon. # # For another great example of "understanding X through code" check out [Jake VanderPlas'](https://twitter.com/jakevdp) talk [Statistics for hackers](https://speakerdeck.com/jakevdp/statistics-for-hackers) ([video](https://www.youtube.com/watch?v=-7I7MWTX0gA)). # # This post covers the basic basics of "genome assembly". Starting with a genome, # creating "reads" from it, breaking these reads up into "kmers", and then # doing the actual assembly. There are a lot of subtleties that I gloss over, # ignored details and ouright shortcuts (hashtag-physics-style). Please biology # friends, don't hate me :) # # I assume you have done some reading about this topic and came here to see it # translated into code. I link to various wikipedia articles through out, but # otherwise will not try and explain the biology reasoning behind all this (not just # because I don't know it myself ...). # # Let's start with generating a random "genome". # In[1]: import copy import string import random from itertools import product from collections import defaultdict import numpy as np random.seed(123) np.random.seed(321) # ## The Genome # # We will use a fairly short genome made up totally at random. In reality # genomes have millions or billions of base pairs and contain structure. # Welcome to the world of ["spherical cows in a vacuum"](https://en.wikipedia.org/wiki/Spherical_cow). # In[2]: genome = "".join(random.choice("AGCT") for _ in range(1000)) # In[3]: # this is a great genome, trust me genome # ## Reading # # To get the genome out of a cell you need to read it. This is called [sequencing](https://en.wikipedia.org/wiki/Sequencing). While there are many methods to do # this in an actual lab, in practice they all create "reads". These are short sections # of the genome that cover a random part of it. To start with let's assume we can # generate these reads without making any mistakes. # In[4]: def perfect_reads(genome, n_reads=10): """Create perfect reads from `genome`""" starts = np.random.randint(len(genome), size=n_reads) length = np.random.randint(27,33, size=n_reads) for n in range(n_reads): low = starts[n] yield genome[low:low + length[n]] # In[5]: list(perfect_reads(genome, 3)) # Reads can vary in size and come without information about where # in the genome they took place. You can end up with duplicate reads # of parts of reads overlapping. This is a good thing. These overlaps # is what will allow us to piece everything back together. Read more # about [Shotgun sequencing](https://en.wikipedia.org/wiki/Shotgun_sequencing) # # _NB:_ Today most real world sequencing is done using ["double barrel" # shotguns](https://en.wikipedia.org/wiki/Shotgun_sequencing#Whole_genome_shotgun_sequencing). The key realisation is that you can create reads from both ends of # a fragment of the genome. This way you gain some information about the # direction of the reads relative to each other and how far they are apart. # # Next each read is turned into a [`k-mer`](https://en.wikipedia.org/wiki/K-mer). # These are what is used during the actual assembly. `k-mer`s are the substrings of # length `k` that you can generate from a string. # In[6]: def kmers(read, k=10): """Generate `k`-mers from a `read`""" for n in range(len(read) - k + 1): yield read[n:n+k] # In[7]: def get_perfect_kmers(genome): kmers_ = [] for read in perfect_reads(genome, n_reads=1000): for kmer in kmers(read): kmers_.append(kmer) return kmers_ # In[8]: kmers_ = get_perfect_kmers(genome) # lots of kmers, but not that many are unique print(len(kmers_), len(set(kmers_))) # With perfect reads from our random genome you end up with a # large number of `k-mers` (of length ten) but most of them # are duplicates. The roughly 20000 `k-mers` we created only # contain about 1000 unique `k-mers`. # # # ## Flipping reads # # In reality the sequencing process is not perfect and will # make mistakes. The simplest way to illustrate this is to # randomly flip a base in a read. Even fairly low error rates # of 1% per base have a large effect on the number of unique `k-mers` # we find. # In[9]: def flip(read, p=0.01): """Flip a base to one of the other bases with probability `p`""" bases = [] for b in read: if np.random.uniform() <= p: bases.append(random.choice("AGCT".replace(b, ""))) else: bases.append(b) return "".join(bases) def reads_with_errors(genome, n_reads=10): """Generate reads where bases might be flipped.""" starts = np.random.randint(len(genome), size=n_reads) length = np.random.randint(27,33, size=n_reads) for n in range(n_reads): low = starts[n] yield flip(genome[low:low + length[n]]) # In[10]: def get_kmers(genome): kmers_ = [] for read in reads_with_errors(genome, n_reads=1000): for kmer in kmers(read): kmers_.append(kmer) return kmers_ kmers_ = get_kmers(genome) # roughly same number of kmers, but a lot more unique ones print(len(kmers_), len(set(kmers_))) # We still generate roughly 20000 `k-mers` but now there are # three times as many unique `k-mers`. This is one of the factors # that makes assembling a genome so hard. # # So now that we have a super simplistic model for generating # `k-mers`, how do we put them back together? # # Assembly # # Left as an exercise for the reader. # # Just kidding. Assembly happens by creating # a [de Bruijn graph](https://en.wikipedia.org/wiki/De_Bruijn_graph) # from our `k-mers`. Importantly each `k-mer` is represented # by an **edge** in the graph, not a node! Nodes are for `k-1-mers`. # If we have one 4-mer, `abcd`, the graph would look like this: `abc -> bcd`. # The first node represents the first `n-1` symbols in the `k-mer` and # the second node represents the last `n-1` symbols. # # We will leave the world of biology behind for a moment and create # completely artificial examples. I found this easier to understand # and debug. # # First we will need something that can turn a string into a graph. # Using `k=2` the `make_graph` function will do just that. # In[11]: def make_graph(string, k): k_mers = list(kmers(string, k)) nodes = defaultdict(list) for kmer in k_mers: head = kmer[:-1] tail = kmer[1:] nodes[head].append(tail) return nodes nodes = make_graph('abcbdexdbfga', 2) print(nodes) # `nodes` is a dictionary mapping each node to the list # of outgoing edges for that node. Its printed representation # is quite ugly, so here is a graphical version of it: # # # # You can render this with `graphviz` but this is not # installed on tmpbnb.org so will fail here. # In[12]: from graphviz import Digraph dot = Digraph(comment='debruijn') for km1mer in nodes: dot.node(km1mer, km1mer) for src in nodes: ends = nodes[src] for end in ends: dot.edge(src, end) dot.format = 'png' dot # ## Eulerian walks # # Why create this weird graph structure from our reads? At # first it seems like it makes things more complicated. Actually # this is an example of transforming your problem to make it # easier. # # To reconstruct the original string from this graph all we # have to do is find a trip along all the edges of the graph # that visits each edge only once. If we can do that, we # are done. People who are into graph theory call this # an eulerian walk. If you grew up in Germany you might # recognise this as solving the [kids puzzle](https://de.wikipedia.org/wiki/Haus_vom_Nikolaus) "Das ist das # Haus vom Ni-ko-laus." # # #
Von SoylentGreen - Eigenes Werk, CC BY-SA 3.0, https://commons.wikimedia.org/w/index.php?curid=2308144