import numpy as np
# Population to sample from: the integers 0 .. 99_999 in ascending order.
arr = np.arange(start=0, stop=100_000)
# One generator per sampling strategy, both starting from seed 123:
# rng1 drives the boolean-mask strategy, rng2 the integer-index strategy.
rng1, rng2 = np.random.default_rng(seed=123), np.random.default_rng(seed=123)
def subset_with_boolean_mask_shuffle(mask_size, *, data=None, rng=None):
    """Return ``mask_size`` randomly chosen elements, kept in original order.

    A boolean mask holding exactly ``mask_size`` ``True`` entries is built,
    shuffled in place, and used to index the array. Boolean indexing returns
    elements in positional order, so the result needs no sorting.

    Parameters
    ----------
    mask_size : int
        Number of elements to select, with ``0 <= mask_size <= len(data)``.
    data : numpy.ndarray, optional
        Array to draw from (selection is along the first axis). Defaults to
        the module-level ``arr``.
    rng : numpy.random.Generator, optional
        Generator used to shuffle the mask. Defaults to the module-level
        ``rng1``.

    Returns
    -------
    numpy.ndarray
        The ``mask_size`` selected elements of ``data``.

    Raises
    ------
    ValueError
        Raised by NumPy if ``mask_size`` is negative or larger than
        ``len(data)`` (one of the mask halves would have negative length).
    """
    # Fall back to the module-level array/generator so existing callers that
    # pass only ``mask_size`` (e.g. the benchmark kernels) behave as before.
    if data is None:
        data = arr
    if rng is None:
        rng = rng1

    # mask_size leading True values followed by False padding up to len(data).
    mask = np.concatenate(
        [
            np.ones(mask_size, dtype=bool),
            np.zeros(len(data) - mask_size, dtype=bool),
        ]
    )
    # Shuffling scatters the True entries to random positions.
    rng.shuffle(mask)
    return data[mask]
def subset_with_int_mask_sort(mask_size, *, data=None, rng=None):
    """Return ``mask_size`` randomly chosen elements, kept in original order.

    ``mask_size`` distinct integer positions are drawn without replacement
    and sorted in place, so that indexing with them yields the elements in
    the order they appear in the array.

    Parameters
    ----------
    mask_size : int
        Number of elements to select; at most ``len(data)``.
    data : numpy.ndarray, optional
        Array to draw from (selection is along the first axis). Defaults to
        the module-level ``arr``.
    rng : numpy.random.Generator, optional
        Generator used to draw the positions. Defaults to the module-level
        ``rng2``.

    Returns
    -------
    numpy.ndarray
        The ``mask_size`` selected elements of ``data``.

    Raises
    ------
    ValueError
        Raised by NumPy if ``mask_size`` exceeds ``len(data)``, since the
        positions are drawn without replacement.
    """
    # Fall back to the module-level array/generator so existing callers that
    # pass only ``mask_size`` (e.g. the benchmark kernels) behave as before.
    if data is None:
        data = arr
    if rng is None:
        rng = rng2

    # Distinct positions in [0, len(data)), returned in random order.
    positions = rng.choice(len(data), replace=False, size=mask_size)
    # Sorting the positions is what preserves the original element order.
    positions.sort()
    return data[positions]
# Example draws of 10 elements with each strategy. In an interactive session
# these calls display the arrays recorded in the comments below them. The
# recorded output is kept as comments: run as a script, a bare
# ``array([...])`` line would raise NameError because ``array`` is undefined.
subset_with_boolean_mask_shuffle(10)
# array([21201, 22803, 24746, 31448, 44969, 56005, 60478, 73845, 89322, 99982])
subset_with_int_mask_sort(10)
# array([ 1543,  5381, 17590, 18436, 22035, 25508, 33361, 59290, 68229, 90904])

# assert that the order is preserved: every element must be strictly smaller
# than its successor in the returned subset.
assert np.all((a := subset_with_boolean_mask_shuffle(100))[:-1] < a[1:])
assert np.all((a := subset_with_int_mask_sort(100))[:-1] < a[1:])
import perfplot
# Time both sampling strategies over a sweep of mask sizes, then plot them.
benchmarks = perfplot.bench(
    # Mask sizes 1, 1 + step, 1 + 2*step, ... below len(arr), where step is
    # one hundredth of the array length.
    n_range=range(1, len(arr), len(arr) // 100),
    # The value handed to each kernel is simply the mask size itself.
    setup=lambda n: n,
    kernels=[subset_with_boolean_mask_shuffle, subset_with_int_mask_sort],
    labels=["boolean array + shuffle", "integer mask + sort"],
    xlabel="mask size",
    # The two kernels return different random subsets, so perfplot's
    # comparison of kernel outputs is switched off.
    equality_check=None,
)
benchmarks.show()