#!/usr/bin/env python # coding: utf-8 # # An introduction to k-mers for genome comparison and analysis # k-mers provide sensitive and specific methods for comparing and analyzing genomes. # # This notebook provides pure Python implementations of some of the basic k-mer comparison techniques implemented in sourmash, including hash-based subsampling techniques. # # ### Running this notebook. # # You can run this notebook interactively via mybinder; click on this button: # [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/dib-lab/sourmash/master?filepath=doc%2Fkmers-and-minhash.ipynb) # # A rendered version of this notebook is available at [sourmash.readthedocs.io](https://sourmash.readthedocs.io) under "Tutorials and notebooks". # # You can also get this notebook from the [doc/ subdirectory of the sourmash github repository](https://github.com/dib-lab/sourmash/tree/master/doc). See [binder/environment.yaml](https://github.com/dib-lab/sourmash/blob/master/binder/environment.yml) for installation dependencies. # # ### What is this? # # This is a Jupyter Notebook using Python 3. If you are running this via [binder](https://mybinder.org), you can use Shift-ENTER to run cells, and double click on code cells to edit them. # # Contact: C. Titus Brown, ctbrown@ucdavis.edu. Please [file issues on GitHub](https://github.com/dib-lab/sourmash/issues/) if you have any questions or comments! # ## Calculating Jaccard similarity and containment # # Given any two collections of k-mers, we can calculate similarity and containment using the union and intersection functionality in Python. # In[1]: def jaccard_similarity(a, b): a = set(a) b = set(b) intersection = len(a.intersection(b)) union = len(a.union(b)) return intersection / union # In[2]: def jaccard_containment(a, b): a = set(a) b = set(b) intersection = len(a.intersection(b)) return intersection / len(a) # ## Let's try these functions out on some simple examples! # In[3]: a = ['ATGG', 'AACC'] b = ['ATGG', 'CACA'] c = ['ATGC', 'CACA'] # In[4]: jaccard_similarity(a, a) # In[5]: jaccard_containment(a, a) # In[6]: jaccard_similarity(b, a) # In[7]: jaccard_similarity(a, c) # In[8]: jaccard_containment(b, a) # In[9]: get_ipython().run_line_magic('matplotlib', 'inline') from matplotlib_venn import venn2, venn3 venn2([set(a), set(b)]) # In[10]: venn3([set(a), set(b), set(c)]) # ## Calculating k-mers from DNA sequences # # To extract k-mers from DNA sequences, we walk over the sequence with a sliding window: # In[11]: def build_kmers(sequence, ksize): kmers = [] n_kmers = len(sequence) - ksize + 1 for i in range(n_kmers): kmer = sequence[i:i + ksize] kmers.append(kmer) return kmers # In[12]: build_kmers('ATGGACCAGATATAGGGAGAGCCAGGTAGGACA', 21) # In the k-mers that are output, you can see how the sequence shifts to the right - look at the pattern in the middle. # # So, now, you can compare two sequences! # In[13]: seq1 = 'ATGGACCAGATATAGGGAGAGCCAGGTAGGACA' seq2 = 'ATGGACCAGATATTGGGAGAGCCGGGTAGGACA' # differences: ^ ^ # In[14]: K = 10 kmers1 = build_kmers(seq1, K) kmers2 = build_kmers(seq2, K) print(K, jaccard_similarity(kmers1, kmers2)) # ## Reading k-mers in from a file # # In practice, we often need to work with 100s of thousands of k-mers, and this means loading them in from sequences in files. # # There are three cut-down genome files in the `genomes/` directory that we will use below: # # ``` # akkermansia.fa # shew_os185.fa # shew_os223.fa # ``` # The latter two are two strains of *Shewanella baltica*, and the first one is an unrelated genome *Akkermansia muciniphila*. # In[15]: import screed # a library for reading in FASTA/FASTQ def read_kmers_from_file(filename, ksize): all_kmers = [] for record in screed.open(filename): sequence = record.sequence kmers = build_kmers(sequence, ksize) all_kmers += kmers return all_kmers # In[16]: akker_kmers = read_kmers_from_file('genomes/akkermansia.fa', 31) # In[17]: akker_kmers[:5] # In[18]: print(len(akker_kmers)) # In[19]: shew1_kmers = read_kmers_from_file('genomes/shew_os185.fa', 31) shew2_kmers = read_kmers_from_file('genomes/shew_os223.fa', 31) # We can see the relationship between these three like so: # In[20]: print('akker vs shew1', jaccard_similarity(akker_kmers, shew1_kmers)) print('akker vs shew2', jaccard_similarity(akker_kmers, shew2_kmers)) print('shew1 vs shew2', jaccard_similarity(shew1_kmers, shew2_kmers)) # In[21]: print('akker vs shew1', jaccard_containment(akker_kmers, shew1_kmers)) print('akker vs shew2', jaccard_containment(akker_kmers, shew2_kmers)) print('shew1 vs shew2', jaccard_containment(shew1_kmers, shew2_kmers)) # In[22]: venn3([set(akker_kmers), set(shew1_kmers), set(shew2_kmers)]) # # Let's hash! # ## Choose a hash function! # # We need to pick a hash function that takes DNA k-mers and converts them into numbers. # # Both the [mash](https://mash.readthedocs.io/en/latest/) software for MinHash, and the [sourmash](https://sourmash.readthedocs.io) software for modulo and MinHash, use MurmurHash: # # https://en.wikipedia.org/wiki/MurmurHash # # this is implemented in the 'mmh3' library in Python. # # The other thing we need to do here is take into account the fact that DNA is double stranded, and so # # ``` # hash_kmer('ATGG') # ``` # should be equivalent to # ``` # hash_kmer('CCAT') # ``` # Following mash's lead, for every input k-mer we will choose a *canonical* k-mer that is the lesser of the k-mer and its reverse complement. # In[23]: import mmh3 def hash_kmer(kmer): # calculate the reverse complement rc_kmer = screed.rc(kmer) # determine whether original k-mer or reverse complement is lesser if kmer < rc_kmer: canonical_kmer = kmer else: canonical_kmer = rc_kmer # calculate murmurhash using a hash seed of 42 hash = mmh3.hash64(canonical_kmer, 42)[0] if hash < 0: hash += 2**64 # done return hash # This is now a function that we can use to turn any DNA "word" into a number: # In[24]: hash_kmer('ATGGC') # The same input word always returns the same number: # In[25]: hash_kmer('ATGGC') # as does its reverse complement: # In[26]: hash_kmer('GCCAT') # and nearby words return very different numbers: # In[27]: hash_kmer('GCCAA') # ## Note that hashing collections of k-mers doesn't change Jaccard calculations: # In[28]: def hash_kmers(kmers): hashes = [] for kmer in kmers: hashes.append(hash_kmer(kmer)) return hashes # In[29]: shew1_hashes = hash_kmers(shew1_kmers) shew2_hashes = hash_kmers(shew2_kmers) # In[30]: print(jaccard_similarity(shew1_kmers, shew2_kmers)) # In[31]: print(jaccard_similarity(shew1_hashes, shew2_hashes)) # (ok, it changes it a little, because of the canonical k-mer calculation!) # ## Implementing subsampling with modulo hashing # # We are now ready to implement k-mer subsampling with modulo hash. # # We need to pick a sampling rate, and know the maximum possible hash value. # # For a sampling rate, let's start with 1000. # # The MurmurHash function turns k-mers into numbers between 0 and `2**64 - 1` (the maximum 64-bit number). # # Let's define these as variables: # In[32]: scaled = 1000 MAX_HASH = 2**64 # Now, choose the range of hash values that we'll keep. # In[33]: keep_below = MAX_HASH / scaled print(keep_below) # and write a filter function: # In[34]: def subsample_modulo(kmers): keep = [] for kmer in kmers: if hash_kmer(kmer) < keep_below: keep.append(kmer) # otherwise, discard return keep # ## Now let's apply this to our big collections of k-mers! # In[35]: akker_sub = subsample_modulo(akker_kmers) shew1_sub = subsample_modulo(shew1_kmers) shew2_sub = subsample_modulo(shew2_kmers) # In[36]: print(len(akker_kmers), len(akker_sub)) print(len(shew1_kmers), len(shew1_sub)) print(len(shew2_kmers), len(shew2_sub)) # So we go from ~500,000 k-mers to ~500 hashes! Do the Jaccard calculations change?? # In[37]: print('akker vs akker, total', jaccard_similarity(akker_kmers, akker_kmers)) print('akker vs akker, sub', jaccard_similarity(akker_sub, akker_sub)) # In[38]: print('akker vs shew1, total', jaccard_similarity(akker_kmers, shew1_kmers)) print('akker vs shew1, sub', jaccard_similarity(akker_sub, shew1_sub)) # In[39]: print('shew1 vs shew2, total', jaccard_similarity(shew1_kmers, shew2_kmers)) print('shew1 vs shew2, sub', jaccard_similarity(shew1_sub, shew2_sub)) # And you can see that the numbers are different, but not very much - the Jaccard similarity is being *estimated*, so it is not exact but it is close. # ## Let's visualize -- # In[40]: venn3([set(akker_kmers), set(shew1_kmers), set(shew2_kmers)]) # In[41]: venn3([set(akker_sub), set(shew1_sub), set(shew2_sub)]) # ## A full list of notebooks # # [An introduction to k-mers for genome comparison and analysis](kmers-and-minhash.ipynb) # # [Some sourmash command line examples!](sourmash-examples.ipynb) # # [Working with private collections of signatures.](sourmash-collections.ipynb)