from bisect import bisect_left
from timeit import timeit
from tf.app import use
from pack import deepSize
# Toy example: map a verse number to its chapter with binary search.
# Each chapter record is (chapter id, first verse, last verse).
chapters = [[1, 1, 10], [2, 11, 20], [3, 21, 30]]

# Two parallel lists: the sorted upper bounds and the chapter ids,
# so bisect_left(lastMs, verse) yields the index of the chapter.
lastMs = [record[2] for record in chapters]
ids = [record[0] for record in chapters]

# Probe every verse number 1..34; values past the last chapter fall
# off the end of the list and are shown as "x".
for i in range(1, 35):
    ind = bisect_left(lastMs, i)
    cid = ids[ind] if ind < len(ids) else "x"
    print(f"{i:>2} => {ind:>2} => {cid:>2}")
1 => 0 => 1 2 => 0 => 1 3 => 0 => 1 4 => 0 => 1 5 => 0 => 1 6 => 0 => 1 7 => 0 => 1 8 => 0 => 1 9 => 0 => 1 10 => 0 => 1 11 => 1 => 2 12 => 1 => 2 13 => 1 => 2 14 => 1 => 2 15 => 1 => 2 16 => 1 => 2 17 => 1 => 2 18 => 1 => 2 19 => 1 => 2 20 => 1 => 2 21 => 2 => 3 22 => 2 => 3 23 => 2 => 3 24 => 2 => 3 25 => 2 => 3 26 => 2 => 3 27 => 2 => 3 28 => 2 => 3 29 => 2 => 3 30 => 2 => 3 31 => 3 => x 32 => 3 => x 33 => 3 => x 34 => 3 => x
lastMs
[10, 20, 30]
A = use('bhsa:clone', silent='deep')
Given a sorted list of valid indices and a parallel list of their values, we can look up any value by means of bisect.
The get function is surprisingly simple and quite fast.
# nametype
# sp
def testPerformance(feat):
    """Compare TF's native feature lookup with a bisect-based lookup.

    The feature's key->value dict is split into two parallel lists
    (sorted keys, matching values), so a key can be found in O(log n)
    with bisect_left.  Prints the sizes of both representations and,
    for each lookup method, four timings:
      1. a key absent from the data (key0)
      2. a key present in the data (key1)
      3. a sweep over a fixed window of keys
      4. a full sweep over the whole key space (normalized per lookup)

    feat: name of a TF feature, e.g. 'nametype' or 'sp'.
    """
    Fs = A.api.Fs
    featObj = Fs(feat)
    data = featObj.data

    # Split the dict into two parallel lists, sorted by key, so that
    # bisect_left can locate any key by binary search.
    indices = []
    values = []
    for (k, v) in sorted(data.items()):
        indices.append(k)
        values.append(v)

    indSize = deepSize(indices)
    valSize = deepSize(values)
    totSize = indSize + valSize
    print(f"{len(data)} items")
    print(f"Size = {deepSize(data)}")
    print(f"Size = {indSize} + {valSize} = {totSize}")

    # Native Text-Fabric lookup function for this feature.
    tfLookup = featObj.v

    def bsLookup(i):
        # Binary search: the insertion point is a hit only when the
        # element found there equals the requested key.
        j = bisect_left(indices, i)
        if j >= len(indices):
            return None
        k = indices[j]
        return values[j] if k == i else None

    maxIndex = max(indices)

    def execute(v):
        """Time lookup function `v` under the four scenarios above."""
        upperIndex = maxIndex + 10
        key0 = 739  # not in the data
        key1 = 740  # in the data

        def w():
            # Fixed window sweep (mix of hits and misses).
            for i in range(700, 1700):
                v(i)  # result deliberately discarded: timing only

        def a():
            # Count the hits over the entire key space.
            n = 0
            for i in range(upperIndex):
                if v(i) is not None:
                    n += 1

        times1 = 1000000
        times2 = 1000
        times3 = 1
        t1 = timeit("v(key0)", globals=locals(), number=times1)
        t2 = timeit("v(key1)", globals=locals(), number=times1)
        t3 = timeit("w()", globals=locals(), number=times2)
        t4 = timeit("a()", globals=locals(), number=times3)
        # t4 is normalized to microseconds per single lookup.
        print(f"{t1:>.3f} {t2:>.3f} {t3:>.3f} {t4 * 1000000 / upperIndex:>.3f}")

    execute(tfLookup)
    execute(bsLookup)
testPerformance('nametype')
38184 items Size = 2380461 Size = 1390248 + 321597 = 1711845 0.154 0.216 0.159 0.164 0.523 0.537 0.512 0.515
testPerformance('sp')
435817 items Size = 33175225 Size = 16016172 + 3814037 = 19830209 0.210 0.214 0.210 0.186 0.637 0.591 0.618 0.634
The performance degradation is a factor of 3-4, but no more memory is used (rather a bit less). Moreover, instead of a dict we have two lists, which we can manage in a separate process by means of ShareableList.