Conventional bloom filter with two hash function optimisation based on Less Hashing, Same Performance: Building a Better Bloom Filter Adam Kirsch, Michael Mitzenmacher
struct BloomFilter
bits::BitArray
k::Int
end
function BloomFilter(n, fpp)
k = -log(fpp)/log(2)
m = n*k/log(2)
BloomFilter(falses(Int(ceil(m))), Int(ceil(k)))
end
fpp(bf::BloomFilter) = .5^bf.k
function Base.show(io::IO, bf::BloomFilter)
print(io, "BloomFilter: ~", Int(ceil(length(bf.bits)/8/1024)), " KiB, ", bf.k, " hashes, fpp=", fpp(bf))
end
@time bf = BloomFilter(10_000_000, 0.01)
0.004836 seconds (4 allocations: 11.426 MiB)
BloomFilter: ~11701 KiB, 7 hashes, fpp=0.0078125
function add!(bf::BloomFilter, s)
m=length(bf.bits)
hash1 = hash(s, UInt(9))
hash2 = hash(s, UInt(17))
for i in 1:bf.k
bf.bits[(hash1+i*hash2)%m] = 1
end
end
add!(bf, "hello")
function in(s, bf::BloomFilter)
m=length(bf.bits)
hash1 = hash(s, UInt(9))
hash2 = hash(s, UInt(17))
!any(b->b, (!bf.bits[(hash1+i*hash2)%m] for i in 1:bf.k))
end
@assert "hello" in bf
@assert !("not hello" in bf)
@assert !("hello not" in bf)
See Analysis of Counting Bloom Filters Used for Count Thresholding - by Kibeom Kim, Yongjo Jeong, Youngjoo Lee and Sunggu Lee
struct CountingBloomFilter{T <: Integer}
counters::Array{T}
k::Int
end
function CountingBloomFilter{T}(n::Int, fpp::Real) where T<:Integer
k = -log(fpp)/log(2) # See paper for a better way of estimating optimal value
m = n*k/log(2)
CountingBloomFilter(zeros(T, Int(ceil(m))), Int(ceil(k)))
end
fpp(cbf::CountingBloomFilter) = .5^cbf.k
function Base.show(io::IO, bf::CountingBloomFilter{T}) where T
print(io, "CountingBloomFilter: ~", Int(ceil(length(bf.counters)*sizeof(T)/1024)), " KiB, ", bf.k, " hashes, fpp=", fpp(bf))
end
@time nsbf = CountingBloomFilter{Int16}(1_000_000, 0.01)
0.007179 seconds (3 allocations: 18.282 MiB)
CountingBloomFilter: ~18721 KiB, 7 hashes, fpp=0.0078125
function add!(bf::CountingBloomFilter, s)
m=length(bf.counters)
hash1 = hash(s, UInt(9))
hash2 = hash(s, UInt(17))
for i in 1:bf.k
bf.counters[(hash1+i*hash2)%m] += 1
end
end
add!(nsbf, "hello")
add!(nsbf, "hello")
function count(bf::CountingBloomFilter, s)
m=length(bf.counters)
hash1 = hash(s, UInt(9))
hash2 = hash(s, UInt(17))
minimum(bf.counters[(hash1+i*hash2)%m] for i in 1:bf.k)
end
@assert count(nsbf, "hello") == 2
@assert count(nsbf, "hellooo") == 0
add!(nsbf, "hello")
@assert count(nsbf, "hello") == 3
See Spectral Bloom Filters Saar Cohen, Yossi Matias and MSc thesis for fuller treatment.
Based on Dynamic Count Filters - J. Aguilar-Saborit, P. Trancoso, V. Muntes-Mulero, J.L. Larriba-Pey
(the below is not fully complete and could be tidyied up considerably but it shows the idea)
# Increment a range inside a bit array by one, returns carry bit
function incr!(ba::BitArray, r::UnitRange)
carry = true
i = r.start
while carry && i < r.stop
if ba[i] == 1
ba[i] = 0
else
ba[i] = 1
carry = false
end
i += 1
end
carry
end
# Returns the value of a range inside a bit array
function value(ba::BitArray, r::UnitRange)
bar = ba[r]
length(bar) >0 ? sum(bar[i+1]*(2^i) for i in 0:length(bar)-1) : 0
end
testArray = falses(10)
@assert value(testArray, 3:5) == 0
@assert incr!(testArray, 3:5) == false
@assert value(testArray, 3:5) == 1
@assert incr!(testArray, 3:5) == false
@assert value(testArray, 3:5) == 2
@assert incr!(testArray, 3:5) == false
@assert value(testArray, 3:5) == 3
@assert incr!(testArray, 3:5) == true
@assert value(testArray, 3:5) == 0
@assert incr!(testArray, 3:5) == false
@assert value(testArray, 3:5) == 1
mutable struct DynamicCountFilter{T <: Integer}
CBFV::Array{T}
OFV::BitArray
k::Int
y::Int
end
function DynamicCountFilter{T}(n::Int, fpp::Real) where T<:Integer
k = -log(fpp)/log(2)
m = Int(ceil(n*k/log(2)))
y = 0
DynamicCountFilter(zeros(T, m), falses(y*m), Int(ceil(k)), y)
end
fpp(dcf::DynamicCountFilter) = .5^dcf.k
function Base.show(io::IO, dcf::DynamicCountFilter{T}) where T
cbfv_bytes = length(dcf.CBFV)*sizeof(T)
ofv_bytes = length(dcf.OFV)/8
total_mem_kb = Int(ceil((cbfv_bytes+ofv_bytes)/1024))
print(io, "DynamicCountFilter: ≈", total_mem_kb, " KiB, ", dcf.k, " hashes, fpp=", fpp(dcf))
end
@time dcf = DynamicCountFilter{UInt8}(5, 0.01)
0.000002 seconds (4 allocations: 304 bytes)
DynamicCountFilter: ≈1 KiB, 7 hashes, fpp=0.0078125
function add!(dcf::DynamicCountFilter{T}, s) where T
m=length(dcf.CBFV)
hash1 = hash(s, UInt(9))
hash2 = hash(s, UInt(17))
for i in 1:dcf.k
j = (hash1+i*hash2)%m
if dcf.CBFV[j+1] == typemax(T)
y = dcf.y
carry = incr!(dcf.OFV, (y*j)+1:y*(j+1))
if (carry)
# Rebuild
ny = y+1
newOFV = falses(m*ny)
for l in 0:m-1
newOFV[(ny*l)+1:(ny*l)+y] = dcf.OFV[(y*l)+1:y*(l+1)]
end
dcf.OFV = newOFV
dcf.y = ny
# set new carry bit
dcf.OFV[ny*(j+1)] = 1
dcf.CBFV[j+1] = 0
end
else
dcf.CBFV[j+1] += 1
end
end
end
@show dcf.OFV, dcf.CBFV;
(dcf.OFV, dcf.CBFV) = (Bool[], UInt8[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00])
function count(dcf::DynamicCountFilter, s)
m=length(dcf.CBFV)
hash1 = hash(s, UInt(9))
hash2 = hash(s, UInt(17))
mₓ=typemax(Int64)
for i in 1:dcf.k
b = (hash1+i*hash2)%m
cₓ = (value(dcf.OFV, (dcf.y*b)+1:dcf.y*(b+1))<<8) + dcf.CBFV[b]
if cₓ < mₓ
mₓ = cₓ
end
end
mₓ
end
count(dcf, "hello")
0
for i in 1:256
add!(dcf, "hello")
end
@show dcf.OFV, dcf.CBFV;
(dcf.OFV, dcf.CBFV) = (Bool[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0], UInt8[0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff])
count(dcf, "hello")
256