from bisect import bisect_left
from timeit import timeit
from tf.app import use
from pack import deepSize
# Toy example: map a verse number to its chapter with binary search.
# Each chapter record is (chapter id, first verse, last verse).
chapters = [[1, 1, 10], [2, 11, 20], [3, 21, 30]]

# Two parallel lists: the sorted upper bounds and the chapter ids,
# so bisect_left(lastMs, verse) yields the index of the chapter.
lastMs = [record[2] for record in chapters]
ids = [record[0] for record in chapters]

# Probe every verse number 1..34; values past the last chapter fall
# off the end of the list and are shown as "x".
for i in range(1, 35):
    ind = bisect_left(lastMs, i)
    cid = ids[ind] if ind < len(ids) else "x"
    print(f"{i:>2} => {ind:>2} => {cid:>2}")
1 => 0 => 1 2 => 0 => 1 3 => 0 => 1 4 => 0 => 1 5 => 0 => 1 6 => 0 => 1 7 => 0 => 1 8 => 0 => 1 9 => 0 => 1 10 => 0 => 1 11 => 1 => 2 12 => 1 => 2 13 => 1 => 2 14 => 1 => 2 15 => 1 => 2 16 => 1 => 2 17 => 1 => 2 18 => 1 => 2 19 => 1 => 2 20 => 1 => 2 21 => 2 => 3 22 => 2 => 3 23 => 2 => 3 24 => 2 => 3 25 => 2 => 3 26 => 2 => 3 27 => 2 => 3 28 => 2 => 3 29 => 2 => 3 30 => 2 => 3 31 => 3 => x 32 => 3 => x 33 => 3 => x 34 => 3 => x
lastMs
[10, 20, 30]
A = use('bhsa:clone', silent='deep')
Given a sorted list of valid indices and a parallel list of their values, we can look up any value by means of bisect.
The get function is surprisingly simple and quite fast.
# nametype
# sp
def testPerformance(feat):
    """Compare TF's native feature lookup with a bisect-based lookup.

    The feature's key->value dict is split into two parallel lists
    (sorted keys, matching values), so a key can be found in O(log n)
    with bisect_left.  Prints the sizes of both representations and,
    for each lookup method, four timings:
      1. a key absent from the data (key0)
      2. a key present in the data (key1)
      3. a sweep over a fixed window of keys
      4. a full sweep over the whole key space (normalized per lookup)

    feat: name of a TF feature, e.g. 'nametype' or 'sp'.
    """
    Fs = A.api.Fs
    featObj = Fs(feat)
    data = featObj.data

    # Split the dict into two parallel lists, sorted by key, so that
    # bisect_left can locate any key by binary search.
    indices = []
    values = []
    for (k, v) in sorted(data.items()):
        indices.append(k)
        values.append(v)

    indSize = deepSize(indices)
    valSize = deepSize(values)
    totSize = indSize + valSize
    print(f"{len(data)} items")
    print(f"Size = {deepSize(data)}")
    print(f"Size = {indSize} + {valSize} = {totSize}")

    # Native Text-Fabric lookup function for this feature.
    tfLookup = featObj.v

    def bsLookup(i):
        # Binary search: the insertion point is a hit only when the
        # element found there equals the requested key.
        j = bisect_left(indices, i)
        if j >= len(indices):
            return None
        k = indices[j]
        return values[j] if k == i else None

    maxIndex = max(indices)

    def execute(v):
        """Time lookup function `v` under the four scenarios above."""
        upperIndex = maxIndex + 10
        key0 = 739  # not in the data
        key1 = 740  # in the data

        def w():
            # Fixed window sweep (mix of hits and misses).
            for i in range(700, 1700):
                v(i)  # result deliberately discarded: timing only

        def a():
            # Count the hits over the entire key space.
            n = 0
            for i in range(upperIndex):
                if v(i) is not None:
                    n += 1

        times1 = 1000000
        times2 = 1000
        times3 = 1
        t1 = timeit("v(key0)", globals=locals(), number=times1)
        t2 = timeit("v(key1)", globals=locals(), number=times1)
        t3 = timeit("w()", globals=locals(), number=times2)
        t4 = timeit("a()", globals=locals(), number=times3)
        # t4 is normalized to microseconds per single lookup.
        print(f"{t1:>.3f} {t2:>.3f} {t3:>.3f} {t4 * 1000000 / upperIndex:>.3f}")

    execute(tfLookup)
    execute(bsLookup)
testPerformance('nametype')
38184 items Size = 2380461 Size = 1390248 + 321597 = 1711845 0.154 0.216 0.159 0.164 0.523 0.537 0.512 0.515
testPerformance('sp')
435817 items Size = 33175225 Size = 16016172 + 3814037 = 19830209 0.210 0.214 0.210 0.186 0.637 0.591 0.618 0.634
The performance degradation is a factor of 3-4, but no more memory is used (rather a bit less). Moreover, instead of a dict we have two lists, which we can manage in a separate process by means of ShareableList.