Given a number and a set of constant bins, what is the fastest way to do the binnning? Values outside of the bins should return nan
, otherwise they should be inclusive on the left bound.
def to_bins(x):
if x < 0.0:
return np.nan
if x < 1.3:
return 0
elif x < 1.35:
return 1
elif x < 2.4:
return 2
# And so on ...
elif x < 84.3:
return 100
else:
return np.nan
import numpy as np
def make_bins(n):
return np.cumsum(np.random.random(n))
def make_data(n, bins):
return np.random.uniform(bins.min() - 0.5, bins.max() + 0.5, n)
bins = make_bins(10)
data = make_data(10000, bins)
bins
array([ 0.1829594 , 0.53392777, 0.87212596, 0.87952459, 1.18258028, 1.84474015, 2.60266074, 2.84080941, 3.7679846 , 4.05534851])
data
array([ 0.72821738, 2.38961742, 3.10126157, ..., 4.17100479, 0.71565192, 2.08038063])
Implement a binary search using numpy.
def make_numpy(bins, otherwise):
def numpy(data):
return np.where((data < bins[0]) | (data >= bins[-1]), otherwise,
np.digitize(data, bins[1:]))
return numpy
numpy = make_numpy(bins, np.nan)
numpy(data)
array([ 1., 5., 7., ..., nan, 1., 5.])
Implement a linear search using numba.
import numba as nb
def make_linear(bins, otherwise):
@nb.vectorize('f8(f8)', nopython=True)
def linear_search(x):
if x < bins[0]:
return otherwise
for i in range(1, bins.shape[0]):
if x < bins[i]:
return i - 1
return otherwise
return linear_search
linear = make_linear(bins, np.nan)
linear(data)
array([ 1., 5., 7., ..., nan, 1., 5.])
Implement a binary search using numba.
def make_binary(bins, otherwise):
@nb.vectorize('f8(f8)', nopython=True)
def binary_search(x):
low = 0
high = len(bins)
mid = (low + high) // 2
while low < high:
if x < bins[mid]:
high = mid
else:
low = mid + 1
mid = (low + high) // 2
if mid == 0 or mid == len(bins):
return otherwise
return mid - 1
return binary_search
binary = make_binary(bins, np.nan)
binary(data)
array([ 1., 5., 7., ..., nan, 1., 5.])
Unroll a binary search into cascading if statements using string metaprogramming. Probably a bad idea.
import sys
from itertools import count
# In python 2, exec is a statement.
# This is needed for compatibility
if sys.version_info[0] == 3:
def _exec(codestr, glbls):
exec(codestr, glbls)
else:
eval(compile("""
def _exec(codestr, glbls):
exec codestr in glbls
""", "<_exec>", "exec"))
def make_meta(bins, otherwise, debug=False):
namespace = {}
_names = count()
# A function for creating symbols and adding them to the namespace.
def gsym(x):
s = "_%d" % next(_names)
namespace[s] = x
return s
body = _make_meta(bins, gsym(otherwise), 0, len(bins), gsym)
code = "def meta(x):\n{body}".format(body=indent(body))
if debug:
print(code)
_exec(code, namespace)
return nb.vectorize('f8(f8)', nopython=True)(namespace['meta'])
indent = lambda s: '\n'.join(' ' + l for l in s.splitlines())
def _make_meta(bins, otherwise, low, high, gsym):
mid = (high + low) // 2
if high > low:
return (
"if x < {bound}:\n"
"{left}\n"
"else:\n"
"{right}").format(bound=gsym(bins[mid]),
left=indent(_make_meta(bins, otherwise, low, mid, gsym)),
right=indent(_make_meta(bins, otherwise, mid + 1, high, gsym)))
elif mid == 0 or mid == len(bins):
return 'return {0}'.format(otherwise)
return 'return {0}'.format(gsym(mid - 1))
# Setting debug to True to print the generated code
meta = make_meta(bins, np.nan, debug=True)
def meta(x): if x < _1: if x < _2: if x < _3: if x < _4: return _0 else: return _5 else: return _6 else: if x < _7: if x < _8: return _9 else: return _10 else: return _11 else: if x < _12: if x < _13: if x < _14: return _15 else: return _16 else: return _17 else: if x < _18: return _19 else: return _0
meta(data)
array([ 1., 5., 7., ..., nan, 1., 5.])
for f in [linear, binary, meta]:
print(np.allclose(numpy(data), f(data), equal_nan=True))
True True True
Benchmark the various implementations using increasing number of bins.
from timeit import default_timer
def time(f, *args, **kwargs):
n = kwargs.get('n', 10)
start = default_timer()
for i in range(n):
f(*args)
end = default_timer()
return (end - start) / n
nbins = [5, 10, 50, 100, 250, 500, 1000]
numpy_times = []
linear_times = []
binary_times = []
meta_times = []
for b in nbins:
bins = make_bins(b)
data = make_data(1000000, bins)
# Build functions
numpy = make_numpy(bins, np.nan)
linear = make_linear(bins, np.nan)
binary = make_binary(bins, np.nan)
meta = make_meta(bins, np.nan)
# Precompile
linear(data)
binary(data)
meta(data)
# Timings
numpy_times.append(time(numpy, data))
linear_times.append(time(linear, data))
binary_times.append(time(binary, data))
meta_times.append(time(meta, data))
import bokeh.charts as bc
from bokeh.palettes import Spectral4
bc.output_notebook(hide_banner=True)
line = bc.Line(dict(nbins=nbins,
numpy=numpy_times,
linear=linear_times,
binary=binary_times,
meta=meta_times),
x='nbins', y=['numpy', 'linear', 'binary', 'meta'],
legend='top_left', palette=Spectral4,
title='Binning Times', ylabel='time (s)')
bc.show(line);