#!/usr/bin/env python # coding: utf-8 # ### Min hashing and Jaccard similarity # # #### Introduction # # There are many ways to measure how similar two strings are: [Hamming distance](http://nbviewer.ipython.org/github/BenLangmead/comp-genomics-class/blob/master/notebooks/CG_DP_EditDist.ipynb), [edit distance](http://nbviewer.ipython.org/github/BenLangmead/comp-genomics-class/blob/master/notebooks/CG_DP_EditDist.ipynb) and [global alignment value](http://nbviewer.ipython.org/github/BenLangmead/comp-genomics-class/blob/master/notebooks/CG_DP_Global.ipynb) for example. Another way is to turn each string into a set, e.g. the set of its constituent $k$-mers, then consider how similar the sets are. # # We define a function that, given a string, returns the set of its constituent $k$-mers. # In[1]: def string_to_kmer_set(Astr, k): return set([Astr[i:i+k] for i in range(len(Astr)-k+1)]) # In[2]: string_to_kmer_set("hello", 3) # The Jaccard similarity coefficient $J(A, B)$ of non-empty sets $A$ and $B$ is: # # $$\frac{|A \cap B|}{|A \cup B|}$$ # # It equals 1 when the sets are identical and 0 when they are disjoint. Otherwise it is between 0 and 1. # In[3]: def jaccard(Aset, Bset): # return Jaccard similarity coefficient between two sets isz = len(Aset.intersection(Bset)) return float(isz) / (len(Aset) + len(Bset) - isz) # In[4]: def jaccard_kmer(Astr, Bstr, k): # turn two strings into sets, then return Jaccard similarity coefficient of those sets return jaccard(string_to_kmer_set(Astr, k), string_to_kmer_set(Bstr, k)) # In[5]: jaccard_kmer("ABC", "ABD", 2) # intersection: {AB}, union: {AB, BC, BD} # so answer = 1/3 # #### Evaluating use of sets # Explicitly building and intersecting sets of strings seems inefficient. Let's see how long it takes to run on many randomly generated pairs of similar strings. # In[6]: import random def add_mutations(string, num_muts): """ Add num_muts random substitution mutations to string """ for _ in range(num_muts): rndi = random.randint(0, len(string)-1) string = string[:rndi] + random.choice('ACGT') + string[rndi+1:] return string def random_jaccard_kmer(length, k): """ Make a random string and a second string separated from the first by a few mutations, then return the two strings and their jaccard similarity coefficient. """ str1 = ''.join([random.choice('ACGT') for _ in range(length)]) str2 = add_mutations(str1, random.randint(0, float(length)/20)) return str1, str2, jaccard_kmer(str1, str2, k) # In[7]: random.seed(77) add_mutations('ACGTACGT', 2) # In[8]: random.seed(76) random_jaccard_kmer(20, 4) # In[9]: import timeit timeit.timeit('random_jaccard_kmer(1000, 10)', setup=''' from __main__ import random_jaccard_kmer; import random; random.seed(223)''', number=10000) # It takes >10 seconds to find Jaccard similarities between 10,000 random pairs of 100-long DNA strings, using $k$-mer length of 10. # ### Min hashing # # #### Introduction # # How about: instead of using the set of all $k$-mers from each string, we pick one representative $k$-mer from each string. Let's pick the *minimum alphabetically*. For example: # In[10]: def string_to_min_kmer(Astr, k): return min([Astr[i:i+k] for i in range(len(Astr)-k+1)]) # In[11]: string_to_min_kmer("hello", 3) # We can compare two strings by comparing their minimal $k$-mers: # In[12]: def jaccard_min_kmer(Astr, Bstr, k): return 1 if string_to_min_kmer(Astr, k) == string_to_min_kmer(Bstr, k) else 0 # In[13]: jaccard_min_kmer("ABC", "ABD", 2) # In[14]: jaccard_min_kmer("ABC", "ACB", 2) # In[15]: jaccard_min_kmer("DBC", "ABC", 2) # This can yield a Jaccard similarity of 0 or 1; we cannot distinguish intermedaite amounts of similarity. On the other hand, we avoided building any sets. # # #### Adding a hash function # # We'll use the [mmh3 library], which contains an implementation of [MurmurHash3], a fast and widely used non-cryptographic hash function. Instead of taking our representative as being the minimal $k$-mer, we'll first *hash* the $k$-mers, then take the $k$-mer with minimal *hash value*: # # [MurmurHash3]: https://code.google.com/p/smhasher/wiki/MurmurHash3 # [mmh3 library]: https://pypi.python.org/pypi/mmh3 # In[16]: # you might need to 'pip install mmh3' first import mmh3 # In[17]: def string_to_min_hash(Astr, k): return min([mmh3.hash (Astr[i:i+k]) for i in range(len(Astr)-k+1)]) # In[18]: string_to_min_hash("hello", 3) # That's the minimum among the hash values of the 3-mers of "hello". # In[19]: def jaccard_min_kmer_hash(Astr, Bstr, k): return 1 if string_to_min_hash(Astr, k) == string_to_min_hash(Bstr, k) else 0 # In[20]: jaccard_min_kmer_hash("ABC", "ABD", 2) # In[21]: jaccard_min_kmer_hash("ABC", "ACB", 2) # In[22]: jaccard_min_kmer_hash("DBC", "ABC", 2) # `jaccard_min_kmer_hash`'s return value won't necessarily match `jaccard_min_kmer`'s, since the function permutes the alphabetical order of the $k$-mers. # #### Multiple hash functions # `jaccard_min_kmer` and `jaccard_min_kmer_hash` return 0 or 1 -- not very precise! We can get a better estimate by calling `jaccard_min_kmer_hash` multiple times, each time using a different hash function. # # Let's rewrite `string_to_min_hash` to include a `seed` parameter that ["salts"] the hash function. # # ["salts"]: http://en.wikipedia.org/wiki/Salt_(cryptography) # In[23]: def string_to_min_hash(Astr, k, seed=0): return min([mmh3.hash(Astr[i:i+k], seed) for i in range(len(Astr)-k+1)]) # Now we can call `string_to_min_hash` with various hash functions, or various saltings of the same function. # In[24]: [string_to_min_hash("hello", 3, k) for k in range(10, 20)] # In[25]: def jaccard_min_kmer_hashes(Astr, Bstr, k, seeds=[0]): tot = sum(string_to_min_hash(Astr, k, seed) == string_to_min_hash(Bstr, k, seed) for seed in seeds) return float(tot) / len(seeds) # In[26]: jaccard_min_kmer_hashes("ABC", "ABD", 2, seeds=range(10)) # Not a terrible estimate. # In[27]: jaccard_min_kmer_hashes("ABC", "ABD", 2, seeds=range(100)) # Again, not terrible. # In[28]: jaccard_min_kmer_hashes("ABC", "ABD", 2, seeds=range(10000)) # A very good estimate: off by only about 1%. # Why does this function give an estimate of Jaccard similarity? Each hash function *permutes* the ordering of the $k$-mers differently. For each permutation, some $k$-mer from the union of all $k$-mers becomes the minimal one. By calculating the fraction of the hash functions for which this minimal $k$-mer is present in both sets, we're estimating the size of the intersection divided by the size of the union: the Jaccard similarity. # In[29]: def random_jaccard_kmer(length, k): str1 = ''.join([random.choice('ACGT') for _ in range(length)]) str2 = add_mutations(str1, random.randint(0, length/20)) return str1, str2, jaccard_min_kmer_hashes(str1, str2, k, seeds=range(10)) import timeit timeit.timeit('random_jaccard_kmer(1000, 10)', setup=''' from __main__ import random_jaccard_kmer; import random; random.seed(223)''', number=10000) # It's slower than what we had before, but for large enough sets it could be faster.