Notebook

In [1]:

%load_ext Cython

Data preparation¶

In [2]:

%%cython

import sys
from array import array
from bisect import bisect_left as bl


cdef int NONE_INT = -1
cdef str NONE_STR = "\x00"
cdef bytes NONE_BYTE = b"\x00"
cdef str UTF8 = "utf8"

INTS = dict(
    B=("char", 1, False),
    b=("signed char", 1, True),
    H=("unsigned short", 2, False),
    h=("short", 2, True),
    I=("unsigned int", 4, False),
    i=("int", 4, True),
    Q=("unsigned int", 8, False),
    q=("int", 8, True),
)


def makeBoundaries():
    minpoints = []
    maxpoints = []
    signedmaxpoints = []
    typeindex = {}
    for (tp, (name, nbytes, signed)) in INTS.items():
        topvalue = 256 ** nbytes
        if signed:
            minvalue = topvalue // 2
            maxvalue = topvalue // 2 - 1
        else:
            minvalue = 0
            maxvalue = topvalue - 1
        if signed:
            typeindex[minvalue] = tp
            minpoints.append(minvalue)
            signedmaxpoints.append(maxvalue)
        else:
            maxpoints.append(maxvalue)
        typeindex[maxvalue] = tp
    minpoints = sorted(minpoints)
    maxpoints = sorted(maxpoints)
    signedmaxpoints = sorted(signedmaxpoints)
    typerank = {
        tp: i
        for (i, tp) in enumerate(
            sorted((t for t in INTS if INTS[t][2]), key=lambda t: INTS[t][1])
        )
    }

    def getType(minv, maxv):
        if minv >= 0:
            pos = bl(maxpoints, maxv)
            return "" if pos >= len(maxpoints) else typeindex[maxpoints[pos]]
        posmin = bl(minpoints, -minv)
        if posmin >= len(minpoints):
            return ""
        if maxv <= 0:
            return typeindex[minpoints[posmin]]
        posmax = bl(signedmaxpoints, maxv)
        if posmax >= len(signedmaxpoints):
            return ""
        typemin = typeindex[minpoints[posmin]]
        typemax = typeindex[signedmaxpoints[posmax]]
        return typemin if typerank[typemin] > typerank[typemax] else typemax

    return getType


getIntSpec = makeBoundaries()


def makeData(items):
    if not items:
        return ((), (), (), b"")
    
    cdef list indices = []
    cdef list offsets = []
    cdef list bounds = []
    cdef list values = [NONE_BYTE]
    cdef dict valueMap = {NONE_STR: (0, 1)}
    cdef int curPos = 1
    cdef str prevV = NONE_STR
    cdef int prevI = -1

    def addValue(str v):
        nonlocal curPos

        if v in valueMap:
            (offset, bound) = valueMap[v]
            offsets.append(offset)
            bounds.append(bound)
        else:
            offsets.append(curPos)
            vb = v.encode(UTF8)
            bound = curPos + len(vb)
            bounds.append(bound)
            values.append(vb)
            valueMap[v] = (curPos, bound)
            curPos += len(vb)

    cdef int i
    cdef str v

    for (i, v) in items:
        if prevI == -1:
            indices.append(i)
            addValue(v)
        elif i != prevI + 1:
            indices.append(prevI + 1)
            addValue(NONE_STR)
            if v is not None:
                indices.append(i)
                addValue(v)
        elif prevV != v:
            indices.append(i)
            addValue(v)
        prevV = v
        prevI = i
    if prevV != NONE_STR:
        indices.append(prevI + 1)
        addValue(NONE_STR)

    offsets.append(curPos)

    valuesBin = b"".join(values)
    cdef str iTp = getIntSpec(min(indices), max(indices))
    indicesBin = array(iTp, indices)
    cdef str oTp = getIntSpec(min(offsets), max(offsets))
    offsetsBin = array(oTp, offsets)
    cdef str bTp = getIntSpec(min(bounds), max(bounds))
    boundsBin = array(bTp, bounds)
    return (indicesBin, offsetsBin, boundsBin, valuesBin)

Extension type for sparse data¶

In [3]:

%%cython

from array import array
from bisect import bisect_right as br

cdef str UTF8 = "utf8"

cdef class CSparseString:
    cdef indices
    cdef nIndices
    cdef offsets
    cdef bounds
    cdef values
    
    def __cinit__(self, indices, offsets, bounds, values):
        self.indices = indices
        self.nIndices = len(indices)
        self.offsets = offsets
        self.bounds = bounds
        self.values = values
        
    cpdef public str size(self):
        indices = self.indices
        offsets = self.offsets
        bounds = self.bounds
        values = self.values

        iSize = indices.itemsize * len(indices)
        oSize = offsets.itemsize * len(offsets)
        bSize = bounds.itemsize * len(bounds)
        vSize = len(values)
        return f"""{iSize + oSize + + bSize + vSize:>8}
\t{'indices':<8}: {indices.typecode}: {iSize:>8}
\t{'offsets':<8}: {offsets.typecode}: {oSize:>8}
\t{'bounds':<8}: {bounds.typecode}: {bSize:>8}
\t{'values':<8}: x: {vSize:>8}
"""

    cpdef public str get(self, int index):
        cdef int pos = br(self.indices, index)
        if pos == 0:
            return None
        pos = self.nIndices - 1 if pos > self.nIndices else pos - 1

        cdef int offset = self.offsets[pos]
        if offset == 0:
            return None
        cdef int bound = self.bounds[pos]
        cdef bytes rawValue = self.values[offset:bound]
        return rawValue.decode(UTF8)

Performance testing¶

In [4]:

import sys
from timeit import timeit
from tf.app import use
from pack import deepSize
from sparse import SparseString

In [5]:

%%cython

def test(mkData, store):
    items = [(5, "foo"), (10, "bar")]
    data = mkData(items)
    dataC = store(*data)

    for i in range(12):
        print(i, dataC.get(i))

In [6]:

test(makeData, CSparseString)

0 None
1 None
2 None
3 None
4 None
5 foo
6 None
7 None
8 None
9 None
10 bar
11 None

We test with a cythonized but unmodified text-fabric.

In [7]:

A = use('bhsa:clone', silent='deep')

TF-app: ~/github/annotation/app-bhsa/code

data: ~/text-fabric-data/etcbc/bhsa/tf/c

data: ~/text-fabric-data/etcbc/phono/tf/c

data: ~/text-fabric-data/etcbc/parallels/tf/c

In [8]:

%%cython

import sys
from timeit import timeit
from tf.app import use
from pack import deepSize
from sparse import SparseString

cdef void doall(v, int fI, int lI):
    cdef int n = 0
    cdef int i
    for i in range(fI - 10, lI + 10):
        if v(i) is not None:
            n += 1

def testPerformance(A, mkData, store, feat, check=True, withC=True):
    api = A.api
    maxNode = api.F.otype.maxNode
    Fs = api.Fs
    featObj = Fs(feat)
    items = sorted(featObj.items())
    firstI = items[0][0]
    halfI = items[len(items) // 2][0]
    lastI = items[-1][0]
    nonI = lastI + 100

    dataS = SparseString(items)
    if withC:
        dataCS = store(*mkData(items))

    tfLookup = featObj.v
    spLookup = dataS.get
    if withC:
        cspLookup = dataCS.get

    print(f"{len(items)} items")
    print(f"Some items:")
    print(f"\tfirst: {firstI:>7} = {tfLookup(firstI)}")
    print(f"\thalf : {halfI:>7} = {tfLookup(halfI)}")
    print(f"\tlast : {lastI:>7} = {tfLookup(lastI)}")
    print(f"\tnon  : {nonI:>7} = {tfLookup(nonI)}")
    print(f"Size (TF compiled) = {deepSize(featObj.data):>8}")
    print(f"Size (Sparse)      = {dataS.size()}")
    if withC:
        print(f"Size (CSparse)     = {dataCS.size()}")

    if check:
        print("checking correctness ...")
        errors = []
        for i in range(maxNode + 5):
            tv = tfLookup(i)
            sv = spLookup(i)
            if withC:
                csv = cspLookup(i)
            if (withC and (tv != sv or tv != csv)) or ((not withC) and tv != sv):
                errors.append(i)
        if errors:
            print(f"{len(errors)} errors")
            for i in errors[0:10]:
                print(
                    f'''{i:>7} TF: "{tfLookup(i)}" SP: "{spLookup(i)}" SCP: "{cspLookup(i)}"'''
                    if withC
                    else f'''{i:>7} TF: "{tfLookup(i)}" SP: "{spLookup(i)}"'''
                )
        else:
            print("all values correct")
    else:
        print("correctness not checked")

    def execute(method, v, doall):
        cdef int fI = firstI
        cdef int hI = halfI
        cdef int lI = lastI
        cdef int nI = nonI

        cdef int times1 = 1000000
        cdef int times2 = max((1, 1000000 // (lI - fI)))

        print(method)
        cdef str label
        cdef str code
        cdef int times
        for (label, code, times) in (
            ("first", "v(fI)", times1),
            ("half", "v(hI)", times1),
            ("last", "v(lI)", times1),
            ("non", "v(nI)", times1),
            ("all", "doall(v, fI, lI)", times2),
        ):
            sys.stdout.write(f"\t{label:<5} {times:>7}x")
            sys.stdout.flush()
            t = timeit(code, globals=locals(), number=times)
            sys.stdout.write(f"  {t:>.4f}\n")

    execute("TF", tfLookup, doall)
    execute("SP", spLookup, doall)
    if withC:
        execute("CSP", cspLookup, doall)

In [9]:

testPerformance(A, makeData, CSparseString, "nametype", withC=True)

38184 items
Some items:
	first:     740 = pers
	half :  210677 = topo
	last : 1446794 = pers
	non  : 1446894 = None
Size (TF compiled) =  2380461
Size (Sparse)      =   418826
	indices : I:   279176
	offsets : B:    69795
	bounds  : B:    69794
	values  : x:       61

Size (CSparse)     =   418826
	indices : I:   279176
	offsets : B:    69795
	bounds  : B:    69794
	values  : x:       61

checking correctness ...
all values correct
TF
	first 1000000x  0.1685
	half  1000000x  0.1748
	last  1000000x  0.1745
	non   1000000x  0.1114
	all         1x  0.1430
SP
	first 1000000x  1.2011
	half  1000000x  1.2664
	last  1000000x  1.2164
	non   1000000x  0.8938
	all         1x  1.3309
CSP
	first 1000000x  0.8623
	half  1000000x  0.9081
	last  1000000x  0.8581
	non   1000000x  0.7141
	all         1x  1.0002

In [27]:

testPerformance(A, makeData, CSparseString, "nametype", withC=True)

38184 items
Some items:
	first:     740 = pers
	half :  210677 = topo
	last : 1446794 = pers
	non  : 1446894 = None
Size (TF compiled) =  2380461
Size (Sparse)      =   418826
	indices : I:   279176
	offsets : B:    69795
	bounds  : B:    69794
	values  : x:       61

Size (CSparse)     =   418826
	indices : I:   279176
	offsets : B:    69795
	bounds  : B:    69794
	values  : x:       61

checking correctness ...
all values correct
TF
	first 1000000x  0.2191
	half  1000000x  0.2282
	last  1000000x  0.2226
	non   1000000x  0.1575
	all         1x  0.2152
SP
	first 1000000x  1.1921
	half  1000000x  1.2419
	last  1000000x  1.1768
	non   1000000x  0.8594
	all         1x  1.3096
CSP
	first 1000000x  0.8719
	half  1000000x  0.8880
	last  1000000x  0.8570
	non   1000000x  0.6908
	all         1x  1.0585

In [10]:

testPerformance(A, makeData, CSparseString, "otype", withC=True)

1446799 items
Some items:
	first:       1 = word
	half :  723400 = phrase
	last : 1446799 = lex
	non  : 1446899 = None
Size (TF compiled) =  8162441
Size (Sparse)      =      183
	indices : I:       56
	offsets : B:       15
	bounds  : B:       14
	values  : x:       98

Size (CSparse)     =      183
	indices : I:       56
	offsets : B:       15
	bounds  : B:       14
	values  : x:       98

checking correctness ...
all values correct
TF
	first 1000000x  0.1466
	half  1000000x  0.2728
	last  1000000x  0.2714
	non   1000000x  0.2211
	all         1x  0.2772
SP
	first 1000000x  0.8017
	half  1000000x  0.8326
	last  1000000x  0.8188
	non   1000000x  0.4878
	all         1x  1.1195
CSP
	first 1000000x  0.4530
	half  1000000x  0.4786
	last  1000000x  0.4703
	non   1000000x  0.2790
	all         1x  0.6361

In [11]:

testPerformance(A, makeData, CSparseString, "g_word_utf8", withC=True)

426584 items
Some items:
	first:       1 = בְּ
	half :  213293 = 
	last :  426584 = יָֽעַל
	non  :  426684 = None
Size (TF compiled) = 41387185
Size (Sparse)      =  6747070
	indices : I:  1706328
	offsets : I:  1706332
	bounds  : I:  1706328
	values  : x:  1628082

Size (CSparse)     =  6747070
	indices : I:  1706328
	offsets : I:  1706332
	bounds  : I:  1706328
	values  : x:  1628082

checking correctness ...
all values correct
TF
	first 1000000x  0.1497
	half  1000000x  0.1709
	last  1000000x  0.1690
	non   1000000x  0.1153
	all         2x  0.1243
SP
	first 1000000x  1.2320
	half  1000000x  1.2380
	last  1000000x  1.3823
	non   1000000x  0.9302
	all         2x  1.2855
CSP
	first 1000000x  0.8650
	half  1000000x  0.9073
	last  1000000x  1.0594
	non   1000000x  0.7417
	all         2x  0.9579

In [ ]: