Simple De Bruijn Graph implementation courtesy of Ben Langmead

(see the Sources document for original source and license of this code)

How to use this notebook:

  • 'activate' cells by clicking on them with the mouse (you will see a blinking cursor)
  • execute cells by pressing the ctrl and enter keys simultaneously
  • you can also execute code by pressing shift + enter, this will activate the next cell

Execute the first few cells until the one that generates the first plot. This will load the necessary python code.

In [1]:
class DeBruijnGraph:
    ''' De Bruijn directed multigraph built from a collection of
        strings. User supplies strings and k-mer length k.  Nodes
        are k-1-mers.  An Edge corresponds to the k-mer that joins
        a left k-1-mer to a right k-1-mer. '''
 
    @staticmethod
    def chop(st, k):
        ''' Chop string into k-mers of given length '''
        # workaround for the fact that length of the sequences
        # in the nodes was one less than size of k in the original implementation
        # as suggested by Katie Dean
        k = k + 1
        for i in xrange(0, len(st)-(k-1)):
            yield (st[i:i+k], st[i:i+k-1], st[i+1:i+k])
    
    class Node:
        ''' Node representing a k-1 mer.  Keep track of # of
            incoming/outgoing edges so it's easy to check for
            balanced, semi-balanced. '''
        
        def __init__(self, km1mer):
            self.km1mer = km1mer
            self.nin = 0
            self.nout = 0
        
        def isSemiBalanced(self):
            return abs(self.nin - self.nout) == 1
        
        def isBalanced(self):
            return self.nin == self.nout
        
        def __hash__(self):
            return hash(self.km1mer)
        
        def __str__(self):
            return self.km1mer
    
    def __init__(self, strIter, k, circularize=False):
        ''' Build de Bruijn multigraph given string iterator and k-mer
            length k '''
        self.G = {}     # multimap from nodes to neighbors
        self.nodes = {} # maps k-1-mers to Node objects
        for st in strIter:
            if circularize:
                st += st[:k-1]
            for kmer, km1L, km1R in self.chop(st, k):
                nodeL, nodeR = None, None
                if km1L in self.nodes:
                    nodeL = self.nodes[km1L]
                else:
                    nodeL = self.nodes[km1L] = self.Node(km1L)
                if km1R in self.nodes:
                    nodeR = self.nodes[km1R]
                else:
                    nodeR = self.nodes[km1R] = self.Node(km1R)
                nodeL.nout += 1
                nodeR.nin += 1
                self.G.setdefault(nodeL, []).append(nodeR)
        # Iterate over nodes; tally # balanced, semi-balanced, neither
        self.nsemi, self.nbal, self.nneither = 0, 0, 0
        # Keep track of head and tail nodes in the case of a graph with
        # Eularian walk (not cycle)
        self.head, self.tail = None, None
        for node in self.nodes.itervalues():
            if node.isBalanced():
                self.nbal += 1
            elif node.isSemiBalanced():
                if node.nin == node.nout + 1:
                    self.tail = node
                if node.nin == node.nout - 1:
                    self.head = node
                self.nsemi += 1
            else:
                self.nneither += 1
    
    def nnodes(self):
        ''' Return # nodes '''
        return len(self.nodes)
    
    def nedges(self):
        ''' Return # edges '''
        return len(self.G)
    
    def hasEulerianWalk(self):
        ''' Return true iff graph has Eulerian walk. '''
        return self.nneither == 0 and self.nsemi == 2
    
    def hasEulerianCycle(self):
        ''' Return true iff graph has Eulerian cycle. '''
        return self.nneither == 0 and self.nsemi == 0
    
    def isEulerian(self):
        ''' Return true iff graph has Eulerian walk or cycle '''
        # technically, if it has an Eulerian walk
        return self.hasEulerianWalk() or self.hasEulerianCycle()
    
    def eulerianWalkOrCycle(self):
        ''' Find and return sequence of nodes (represented by
            their k-1-mer labels) corresponding to Eulerian walk
            or cycle '''
        assert self.isEulerian()
        g = self.G
        if self.hasEulerianWalk():
            g = g.copy()
            g.setdefault(self.tail, []).append(self.head)
        # graph g has an Eulerian cycle
        tour = []
        src = g.iterkeys().next() # pick arbitrary starting node
        
        def __visit(n):
            while len(g[n]) > 0:
                dst = g[n].pop()
                __visit(dst)
            tour.append(n)
        
        __visit(src)
        tour = tour[::-1][:-1] # reverse and then take all but last node
            
        if self.hasEulerianWalk():
            # Adjust node list so that it starts at head and ends at tail
            sti = tour.index(self.head)
            tour = tour[sti:] + tour[:sti]
        
        # Return node list
        return map(str, tour)
In [2]:
class DeBruijnPlot(DeBruijnGraph):
    def to_dot(self, weights=False):
        """ Write dot representation to given filehandle.  If 'weights'
            is true, label edges corresponding to distinct k-1-mers
            with weights, instead of writing a separate edge for each
            copy of a k-1-mer. """
        dot_str = []
        dot_str.append("digraph \"DeBruijn graph\" {\n")
        dot_str.append("  bgcolor=\"transparent\";\n")
        for node in self.G.iterkeys():
            lab = node.km1mer
            dot_str.append("  %s [label=\"%s\"] ;\n" % (lab, lab))
        for src, dsts in self.G.iteritems():
            srclab = src.km1mer
            if weights:
                weightmap = {}
                if weights:
                    for dst in dsts:
                        weightmap[dst] = weightmap.get(dst, 0) + 1
                for dst, v in weightmap.iteritems():
                    dstlab = dst.km1mer
                    dot_str.append("  %s -> %s [label=\"%d\"] ;\n" % (srclab, dstlab, v))
            else:
                for dst in dsts:
                    srclab = src.km1mer
                    dstlab = dst.km1mer
                    dot_str.append("  %s -> %s [label=\"\"] ;\n" % (srclab, dstlab))
        dot_str.append("}\n")
        return ''.join(dot_str)
In [3]:
%install_ext https://raw.githubusercontent.com/cjdrake/ipython-magic/master/gvmagic.py
Installed gvmagic.py. To use it, type:
  %load_ext gvmagic
In [4]:
%load_ext gvmagic

Exercise 1: simple De Bruijn graph

Do the exercise by execute the code cell below.

Let's first generate a really simple De Bruijn graph of a short read 'GCTGATCGATTT' with kmer size 4

  • study the graph and see if it indeed encodes the read
  • now change the kmer size to 3 and again trace the read: what happened?
In [6]:
%dotobj DeBruijnPlot(['GCTGATCGATTT'], 3)
DeBruijn graph ATC ATC TCG TCG ATC->TCG CGA CGA GAT GAT CGA->GAT GAT->ATC ATT ATT GAT->ATT TTT TTT ATT->TTT CTG CTG TGA TGA CTG->TGA GCT GCT GCT->CTG TCG->CGA TGA->GAT

Exercise 2: simple overlap

The following command will generate the De Bruijn graph with k = 4 for two reads that have some overlap:

GCTGATCGATTT  
      CGATTTTCGGCGAA
  • find the two reads again in the graph: which nodes represent the overlap?
  • change k from 4 to 5, to 6, to 7, to 8, what happens?
In [ ]:
%dotobj DeBruijnPlot(['GCTGATCGATTT', 'CGATTTTCGGCGAA'], 4)

Exercise 3: overlap and an internal repeat

  • these two reads overlap, but also have short repeat: the sequence ATCGA is present in both reads
GCTGATCGATTT
      CGATTTTATCGAAA
  • what happened with the repeat? And the overlap?
  • change k from 4 to 5, to 6, to 7, to 8, what happens?
In [ ]:
%dotobj DeBruijnPlot(['GCTGATCGATTT', 'CGATTTTATCGAAA'], 4)

Exercise 4: resolving a repeat

The following sequence has two copies of the ATCGA repeat

Decrease and increase the kmer size stepwise

  • at what kmer size is the repeat resolved?
  • why this kmer size?
In [8]:
%dotobj DeBruijnPlot(['GCTGATCGATATGGATTTTATCGAAAAGTCGTAGTC'], 5)
DeBruijn graph GGATT GGATT GATTT GATTT GGATT->GATTT GTCGT GTCGT TCGTA TCGTA GTCGT->TCGTA TATGG TATGG ATGGA ATGGA TATGG->ATGGA TGATC TGATC GATCG GATCG TGATC->GATCG GTAGT GTAGT TAGTC TAGTC GTAGT->TAGTC AGTCG AGTCG AGTCG->GTCGT CGTAG CGTAG TCGTA->CGTAG ATCGA ATCGA GATCG->ATCGA TTTTA TTTTA TTTAT TTTAT TTTTA->TTTAT TGGAT TGGAT TGGAT->GGATT GATAT GATAT ATATG ATATG GATAT->ATATG CGATA CGATA CGATA->GATAT CTGAT CTGAT CTGAT->TGATC AAGTC AAGTC AAGTC->AGTCG CGAAA CGAAA GAAAA GAAAA CGAAA->GAAAA ATGGA->TGGAT TCGAA TCGAA TCGAA->CGAAA ATATG->TATGG GCTGA GCTGA GCTGA->CTGAT ATTTT ATTTT GATTT->ATTTT TCGAT TCGAT TCGAT->CGATA TTATC TTATC TTTAT->TTATC TATCG TATCG TTATC->TATCG ATTTT->TTTTA AAAAG AAAAG GAAAA->AAAAG AAAGT AAAGT AAAGT->AAGTC CGTAG->GTAGT ATCGA->TCGAA ATCGA->TCGAT TATCG->ATCGA AAAAG->AAAGT

Exercise 5: read with error

  • these two reads are identical, except for a one base 'error' in the second one
GCTGATCGATTT
GCTGATAGATTT
  • create the graph with kmer size 5 and 5 copies of the first read
  • add one copy of the second read (again, kmer size 5)
  • explain the graph
  • increase the kmer size
In [10]:
%dotobj DeBruijnPlot(['GCTGATCGATTT','GCTGATCGATTT','GCTGATCGATTT','GCTGATCGATTT','GCTGATCGATTT','GCTGATAGATTT'], 5)
DeBruijn graph CTGAT CTGAT TGATA TGATA CTGAT->TGATA TGATC TGATC CTGAT->TGATC CTGAT->TGATC CTGAT->TGATC CTGAT->TGATC CTGAT->TGATC ATAGA ATAGA TAGAT TAGAT ATAGA->TAGAT GATAG GATAG TGATA->GATAG GATAG->ATAGA GATCG GATCG TGATC->GATCG TGATC->GATCG TGATC->GATCG TGATC->GATCG TGATC->GATCG GCTGA GCTGA GCTGA->CTGAT GCTGA->CTGAT GCTGA->CTGAT GCTGA->CTGAT GCTGA->CTGAT GCTGA->CTGAT AGATT AGATT TAGAT->AGATT CGATT CGATT GATTT GATTT CGATT->GATTT CGATT->GATTT CGATT->GATTT CGATT->GATTT CGATT->GATTT AGATT->GATTT ATCGA ATCGA GATCG->ATCGA GATCG->ATCGA GATCG->ATCGA GATCG->ATCGA GATCG->ATCGA TCGAT TCGAT ATCGA->TCGAT ATCGA->TCGAT ATCGA->TCGAT ATCGA->TCGAT ATCGA->TCGAT TCGAT->CGATT TCGAT->CGATT TCGAT->CGATT TCGAT->CGATT TCGAT->CGATT