To calculate the similarity of two sets, we can use the Jaccard index, which divides the size of the sets' intersection by the size of their union. As with the other problems we've discussed so far, keeping explicit representations of sets around is intractable for very large sets, but it is also intractable if we have very many sets, for example, if we're building a search engine. We would like a way to construct signatures of sets in such a way that we can calculate their approximate similarity scalably.
Minhash is a technique for constructing signatures of sets that will allow us to estimate their approximate similarity. Here's the basic technique: a set's signature records, for each of several hash functions, the minimum hash value seen across every element in the set.
import numpy as np
import random
import pickle
from sklearn.utils.murmurhash import murmurhash3_bytes_u32 as mhash
def murmurmaker(seed):
    """
    Return a function that calculates a 32-bit murmurhash of v
    (a str, int-like, bytes, or arbitrary picklable object), using
    the given seed.
    """
    def m(v):
        # Normalize v to bytes before hashing; check str before the
        # to_bytes fallback so strings aren't misrouted.
        if isinstance(v, str):
            bvalue = v.encode("utf-8")
        elif hasattr(v, "to_bytes"):
            # int-likes: fixed-width big-endian encoding.
            # NOTE(review): raises OverflowError for negative ints or ints
            # wider than 128 bytes — confirm callers only pass small
            # non-negative values.
            bvalue = v.to_bytes(128, "big")
        elif isinstance(v, bytes):
            bvalue = v
        else:
            # Fallback for arbitrary objects: hash their pickled form.
            bvalue = pickle.dumps(v)
        return mhash(bvalue, seed=seed)
    return m
class SimpleMinhash(object):
    """A very basic implementation of minhash.

    Maintains, for each of `hashes` independent hash functions, the
    minimum hash value observed over every element added to the set.
    """
    def __init__(self, hashes):
        # Fixed seed so independently-built sketches use the same hash
        # functions and are therefore comparable.
        rng = np.random.RandomState(seed=int.from_bytes(b"rad!", "big"))
        # Each bucket starts at the maximum 32-bit value; minima only decrease.
        self.buckets = np.full(hashes, (1 << 32) - 1)
        self.hashes = [murmurmaker(seed) for seed in rng.randint(0, (1 << 32) - 1, hashes)]
    def add(self, obj):
        """Fold obj into the sketch by taking the elementwise minimum of
        each bucket and obj's hash under the corresponding function."""
        self.buckets = np.minimum(self.buckets, [h(obj) for h in self.hashes])
    def similarity(self, other):
        """Estimate the Jaccard similarity of this sketch's set and
        other's as the fraction of buckets in which the two agree."""
        return np.count_nonzero(self.buckets == other.buckets) / float(len(self.buckets))
    def merge(self, other):
        """Returns a newly-allocated minhash structure containing
        the merge of this hash and another."""
        # Allocate an empty sketch (0 hash functions) and overwrite its
        # state, avoiding regenerating hash functions we'll discard.
        result = SimpleMinhash(0)
        result.buckets = np.minimum(self.buckets, other.buckets)
        result.hashes = self.hashes
        return result
We can test a small Minhash with random values to see how well the approximate Jaccard index implementation works.
def test_minhash(count=50000, expected_percentage=.20):
    """Populate two sketches with roughly `expected_percentage` shared
    random elements (the remainder split evenly between them) and return
    the estimated similarity of the two sketches."""
    sketch_a = SimpleMinhash(1024)
    sketch_b = SimpleMinhash(1024)
    shared_cutoff = 1000 * expected_percentage
    for i in range(count):
        element = random.getrandbits(64).to_bytes(8, "big")
        if i % 1000 < shared_cutoff:
            # shared element: goes into both sketches
            sketch_a.add(element)
            sketch_b.add(element)
        elif i % 2 == 0:
            sketch_a.add(element)
        else:
            sketch_b.add(element)
    return sketch_a.similarity(sketch_b)
test_minhash()
A very common application for these kinds of document signatures is identifying similar documents based on the words that they contain -- this is useful, e.g., for detecting plagiarized prose or grouping similar web pages or news articles together. Unfortunately, even having an efficient way to calculate pairwise similarities is insufficient for this application: it doesn't matter how cheap it is to do a pairwise comparison if we have to compare every pair in a large document collection! We can use locality-sensitive hashing to quickly identify similar documents without explicit pairwise comparisons. The basic idea is that we'll return a set of keys, each corresponding to the hash of a subset of the signature.
class LSHMinhash(object):
    """A very basic implementation of minhash with locality-sensitive hashing."""
    def __init__(self, rows, bands):
        # Fixed seed so independently-built sketches use the same hash
        # functions and are therefore comparable.
        rng = np.random.RandomState(seed=int.from_bytes(b"rad!", "big"))
        hashes = rows * bands
        self.rows = rows
        self.bands = bands
        # Each bucket starts at the maximum 32-bit value; minima only decrease.
        self.buckets = np.full(hashes, (1 << 32) - 1)
        self.hashes = [murmurmaker(seed) for seed in rng.randint(0, (1 << 32) - 1, hashes)]
    def add(self, obj):
        """Fold obj into the sketch by taking the elementwise minimum of
        each bucket and obj's hash under the corresponding function."""
        self.buckets = np.minimum(self.buckets, [h(obj) for h in self.hashes])
    def similarity(self, other):
        """Estimate the Jaccard similarity of this sketch's set and
        other's as the fraction of buckets in which the two agree."""
        return np.count_nonzero(self.buckets == other.buckets) / float(len(self.buckets))
    def merge(self, other):
        """returns a newly-allocated minhash structure containing
        the merge of this hash and another"""
        result = LSHMinhash(self.rows, self.bands)
        # BUG FIX: was `numpy.minimum(...)`, a NameError since the module
        # is imported as `np`. Third argument is the `out` array: write
        # the elementwise minima directly into result.buckets.
        np.minimum(self.buckets, other.buckets, result.buckets)
        result.hashes = self.hashes
        return result
    def lsh_keys(self):
        """Return one LSH key per band: the first hash function applied to
        the list of that band's bucket values."""
        # NOTE(review): reshape((rows, bands)) yields `rows` groups of
        # `bands` values each; conventional banding is `bands` groups of
        # `rows` values — confirm the intended orientation before changing.
        return [self.hashes[0]([b for b in band])
                for band in self.buckets.copy().reshape((self.rows, self.bands))]
def test_lsh_minhash(count=50000, expected_percentage=.20):
    """Populate two LSH sketches with roughly `expected_percentage` shared
    random elements (the rest split evenly) and return the estimated
    similarity along with both sketches' LSH keys."""
    sketch_a = LSHMinhash(64, 16)
    sketch_b = LSHMinhash(64, 16)
    shared_cutoff = 1000 * expected_percentage
    for i in range(count):
        element = random.getrandbits(64).to_bytes(8, "big")
        if i % 1000 < shared_cutoff:
            # shared element: goes into both sketches
            sketch_a.add(element)
            sketch_b.add(element)
        elif i % 2 == 0:
            sketch_a.add(element)
        else:
            sketch_b.add(element)
    return (sketch_a.similarity(sketch_b), sketch_a.lsh_keys(), sketch_b.lsh_keys())
tup = test_lsh_minhash(expected_percentage=.95)
We can then group cells by keys (or even by parts of their keys) to identify candidate matches, which lets us only check a subset of all potential matches for similarity:
# Print each pair of band keys on which the two signatures agree.
for key_a, key_b in zip(tup[1], tup[2]):
    if key_a == key_b:
        print((key_a, key_b))
def lsh_minhash_many(count=5000, expected_percentage=0.2, pools=32):
    """Build `pools` LSH minhash sketches over random elements.

    Roughly `expected_percentage` of the elements are added to every
    sketch (common elements); the rest are assigned round-robin to a
    single sketch each. Returns the list of sketches.

    `pools` generalizes the previously hard-coded 32 sketches; the
    default preserves the old behavior.
    """
    sketches = [LSHMinhash(64, 16) for _ in range(pools)]
    for i in range(count):
        element = random.getrandbits(64).to_bytes(8, "big")
        if i % 1000 < (1000 * expected_percentage):
            # Common element: add to every sketch. A plain loop, not a
            # throwaway list comprehension, since only the side effect matters.
            for sketch in sketches:
                sketch.add(element)
        else:
            sketches[i % pools].add(element)
    return sketches
lshm_results = lsh_minhash_many(5000, 0.5)
from collections import defaultdict
# Group sketch indices by (truncated) band key: sketches that share a key
# in any band are candidate matches worth a full similarity check.
# `defaultdict(list)` replaces the redundant `defaultdict(lambda: list())`.
bands = [defaultdict(list) for _ in range(64)]
for sketch_index, lshm in enumerate(lshm_results):
    for band_index, key in enumerate(lshm.lsh_keys()):
        # Keep only the low 14 bits of each key to bound the table size.
        bands[band_index][key % (1 << 14)].append(sketch_index)
bands
To learn more about Minhash, locality-sensitive hashing, and similar techniques, see Chapter 3 of Mining of Massive Datasets by Leskovec, Rajaraman, and Ullman.