%load_ext Cython
%%cython
import sys
from array import array
from bisect import bisect_left as bl
cdef int NONE_INT = -1
cdef str NONE_STR = "\x00"
cdef bytes NONE_BYTE = b"\x00"
cdef str UTF8 = "utf8"
INTS = dict(
B=("char", 1, False),
b=("signed char", 1, True),
H=("unsigned short", 2, False),
h=("short", 2, True),
I=("unsigned int", 4, False),
i=("int", 4, True),
Q=("unsigned int", 8, False),
q=("int", 8, True),
)
def makeBoundaries():
minpoints = []
maxpoints = []
signedmaxpoints = []
typeindex = {}
for (tp, (name, nbytes, signed)) in INTS.items():
topvalue = 256 ** nbytes
if signed:
minvalue = topvalue // 2
maxvalue = topvalue // 2 - 1
else:
minvalue = 0
maxvalue = topvalue - 1
if signed:
typeindex[minvalue] = tp
minpoints.append(minvalue)
signedmaxpoints.append(maxvalue)
else:
maxpoints.append(maxvalue)
typeindex[maxvalue] = tp
minpoints = sorted(minpoints)
maxpoints = sorted(maxpoints)
signedmaxpoints = sorted(signedmaxpoints)
typerank = {
tp: i
for (i, tp) in enumerate(
sorted((t for t in INTS if INTS[t][2]), key=lambda t: INTS[t][1])
)
}
def getType(minv, maxv):
if minv >= 0:
pos = bl(maxpoints, maxv)
return "" if pos >= len(maxpoints) else typeindex[maxpoints[pos]]
posmin = bl(minpoints, -minv)
if posmin >= len(minpoints):
return ""
if maxv <= 0:
return typeindex[minpoints[posmin]]
posmax = bl(signedmaxpoints, maxv)
if posmax >= len(signedmaxpoints):
return ""
typemin = typeindex[minpoints[posmin]]
typemax = typeindex[signedmaxpoints[posmax]]
return typemin if typerank[typemin] > typerank[typemax] else typemax
return getType
getIntSpec = makeBoundaries()
def makeData(items):
if not items:
return ((), (), (), b"")
cdef list indices = []
cdef list offsets = []
cdef list bounds = []
cdef list values = [NONE_BYTE]
cdef dict valueMap = {NONE_STR: (0, 1)}
cdef int curPos = 1
cdef str prevV = NONE_STR
cdef int prevI = -1
def addValue(str v):
nonlocal curPos
if v in valueMap:
(offset, bound) = valueMap[v]
offsets.append(offset)
bounds.append(bound)
else:
offsets.append(curPos)
vb = v.encode(UTF8)
bound = curPos + len(vb)
bounds.append(bound)
values.append(vb)
valueMap[v] = (curPos, bound)
curPos += len(vb)
cdef int i
cdef str v
for (i, v) in items:
if prevI == -1:
indices.append(i)
addValue(v)
elif i != prevI + 1:
indices.append(prevI + 1)
addValue(NONE_STR)
if v is not None:
indices.append(i)
addValue(v)
elif prevV != v:
indices.append(i)
addValue(v)
prevV = v
prevI = i
if prevV != NONE_STR:
indices.append(prevI + 1)
addValue(NONE_STR)
offsets.append(curPos)
valuesBin = b"".join(values)
cdef str iTp = getIntSpec(min(indices), max(indices))
indicesBin = array(iTp, indices)
cdef str oTp = getIntSpec(min(offsets), max(offsets))
offsetsBin = array(oTp, offsets)
cdef str bTp = getIntSpec(min(bounds), max(bounds))
boundsBin = array(bTp, bounds)
return (indicesBin, offsetsBin, boundsBin, valuesBin)
%%cython
from array import array
from bisect import bisect_right as br
cdef str UTF8 = "utf8"
cdef class CSparseString:
cdef indices
cdef nIndices
cdef offsets
cdef bounds
cdef values
def __cinit__(self, indices, offsets, bounds, values):
self.indices = indices
self.nIndices = len(indices)
self.offsets = offsets
self.bounds = bounds
self.values = values
cpdef public str size(self):
indices = self.indices
offsets = self.offsets
bounds = self.bounds
values = self.values
iSize = indices.itemsize * len(indices)
oSize = offsets.itemsize * len(offsets)
bSize = bounds.itemsize * len(bounds)
vSize = len(values)
return f"""{iSize + oSize + + bSize + vSize:>8}
\t{'indices':<8}: {indices.typecode}: {iSize:>8}
\t{'offsets':<8}: {offsets.typecode}: {oSize:>8}
\t{'bounds':<8}: {bounds.typecode}: {bSize:>8}
\t{'values':<8}: x: {vSize:>8}
"""
cpdef public str get(self, int index):
cdef int pos = br(self.indices, index)
if pos == 0:
return None
pos = self.nIndices - 1 if pos > self.nIndices else pos - 1
cdef int offset = self.offsets[pos]
if offset == 0:
return None
cdef int bound = self.bounds[pos]
cdef bytes rawValue = self.values[offset:bound]
return rawValue.decode(UTF8)
import sys
from timeit import timeit
from tf.app import use
from pack import deepSize
from sparse import SparseString
%%cython
def test(mkData, store):
items = [(5, "foo"), (10, "bar")]
data = mkData(items)
dataC = store(*data)
for i in range(12):
print(i, dataC.get(i))
test(makeData, CSparseString)
0 None 1 None 2 None 3 None 4 None 5 foo 6 None 7 None 8 None 9 None 10 bar 11 None
We test with a cythonized but unmodified text-fabric.
A = use('bhsa:clone', silent='deep')
%%cython
import sys
from timeit import timeit
from tf.app import use
from pack import deepSize
from sparse import SparseString
cdef void doall(v, int fI, int lI):
cdef int n = 0
cdef int i
for i in range(fI - 10, lI + 10):
if v(i) is not None:
n += 1
def testPerformance(A, mkData, store, feat, check=True, withC=True):
api = A.api
maxNode = api.F.otype.maxNode
Fs = api.Fs
featObj = Fs(feat)
items = sorted(featObj.items())
firstI = items[0][0]
halfI = items[len(items) // 2][0]
lastI = items[-1][0]
nonI = lastI + 100
dataS = SparseString(items)
if withC:
dataCS = store(*mkData(items))
tfLookup = featObj.v
spLookup = dataS.get
if withC:
cspLookup = dataCS.get
print(f"{len(items)} items")
print(f"Some items:")
print(f"\tfirst: {firstI:>7} = {tfLookup(firstI)}")
print(f"\thalf : {halfI:>7} = {tfLookup(halfI)}")
print(f"\tlast : {lastI:>7} = {tfLookup(lastI)}")
print(f"\tnon : {nonI:>7} = {tfLookup(nonI)}")
print(f"Size (TF compiled) = {deepSize(featObj.data):>8}")
print(f"Size (Sparse) = {dataS.size()}")
if withC:
print(f"Size (CSparse) = {dataCS.size()}")
if check:
print("checking correctness ...")
errors = []
for i in range(maxNode + 5):
tv = tfLookup(i)
sv = spLookup(i)
if withC:
csv = cspLookup(i)
if (withC and (tv != sv or tv != csv)) or ((not withC) and tv != sv):
errors.append(i)
if errors:
print(f"{len(errors)} errors")
for i in errors[0:10]:
print(
f'''{i:>7} TF: "{tfLookup(i)}" SP: "{spLookup(i)}" SCP: "{cspLookup(i)}"'''
if withC
else f'''{i:>7} TF: "{tfLookup(i)}" SP: "{spLookup(i)}"'''
)
else:
print("all values correct")
else:
print("correctness not checked")
def execute(method, v, doall):
cdef int fI = firstI
cdef int hI = halfI
cdef int lI = lastI
cdef int nI = nonI
cdef int times1 = 1000000
cdef int times2 = max((1, 1000000 // (lI - fI)))
print(method)
cdef str label
cdef str code
cdef int times
for (label, code, times) in (
("first", "v(fI)", times1),
("half", "v(hI)", times1),
("last", "v(lI)", times1),
("non", "v(nI)", times1),
("all", "doall(v, fI, lI)", times2),
):
sys.stdout.write(f"\t{label:<5} {times:>7}x")
sys.stdout.flush()
t = timeit(code, globals=locals(), number=times)
sys.stdout.write(f" {t:>.4f}\n")
execute("TF", tfLookup, doall)
execute("SP", spLookup, doall)
if withC:
execute("CSP", cspLookup, doall)
testPerformance(A, makeData, CSparseString, "nametype", withC=True)
38184 items Some items: first: 740 = pers half : 210677 = topo last : 1446794 = pers non : 1446894 = None Size (TF compiled) = 2380461 Size (Sparse) = 418826 indices : I: 279176 offsets : B: 69795 bounds : B: 69794 values : x: 61 Size (CSparse) = 418826 indices : I: 279176 offsets : B: 69795 bounds : B: 69794 values : x: 61 checking correctness ... all values correct TF first 1000000x 0.1685 half 1000000x 0.1748 last 1000000x 0.1745 non 1000000x 0.1114 all 1x 0.1430 SP first 1000000x 1.2011 half 1000000x 1.2664 last 1000000x 1.2164 non 1000000x 0.8938 all 1x 1.3309 CSP first 1000000x 0.8623 half 1000000x 0.9081 last 1000000x 0.8581 non 1000000x 0.7141 all 1x 1.0002
testPerformance(A, makeData, CSparseString, "nametype", withC=True)
38184 items Some items: first: 740 = pers half : 210677 = topo last : 1446794 = pers non : 1446894 = None Size (TF compiled) = 2380461 Size (Sparse) = 418826 indices : I: 279176 offsets : B: 69795 bounds : B: 69794 values : x: 61 Size (CSparse) = 418826 indices : I: 279176 offsets : B: 69795 bounds : B: 69794 values : x: 61 checking correctness ... all values correct TF first 1000000x 0.2191 half 1000000x 0.2282 last 1000000x 0.2226 non 1000000x 0.1575 all 1x 0.2152 SP first 1000000x 1.1921 half 1000000x 1.2419 last 1000000x 1.1768 non 1000000x 0.8594 all 1x 1.3096 CSP first 1000000x 0.8719 half 1000000x 0.8880 last 1000000x 0.8570 non 1000000x 0.6908 all 1x 1.0585
testPerformance(A, makeData, CSparseString, "otype", withC=True)
1446799 items Some items: first: 1 = word half : 723400 = phrase last : 1446799 = lex non : 1446899 = None Size (TF compiled) = 8162441 Size (Sparse) = 183 indices : I: 56 offsets : B: 15 bounds : B: 14 values : x: 98 Size (CSparse) = 183 indices : I: 56 offsets : B: 15 bounds : B: 14 values : x: 98 checking correctness ... all values correct TF first 1000000x 0.1466 half 1000000x 0.2728 last 1000000x 0.2714 non 1000000x 0.2211 all 1x 0.2772 SP first 1000000x 0.8017 half 1000000x 0.8326 last 1000000x 0.8188 non 1000000x 0.4878 all 1x 1.1195 CSP first 1000000x 0.4530 half 1000000x 0.4786 last 1000000x 0.4703 non 1000000x 0.2790 all 1x 0.6361
testPerformance(A, makeData, CSparseString, "g_word_utf8", withC=True)
426584 items Some items: first: 1 = בְּ half : 213293 = last : 426584 = יָֽעַל non : 426684 = None Size (TF compiled) = 41387185 Size (Sparse) = 6747070 indices : I: 1706328 offsets : I: 1706332 bounds : I: 1706328 values : x: 1628082 Size (CSparse) = 6747070 indices : I: 1706328 offsets : I: 1706332 bounds : I: 1706328 values : x: 1628082 checking correctness ... all values correct TF first 1000000x 0.1497 half 1000000x 0.1709 last 1000000x 0.1690 non 1000000x 0.1153 all 2x 0.1243 SP first 1000000x 1.2320 half 1000000x 1.2380 last 1000000x 1.3823 non 1000000x 0.9302 all 2x 1.2855 CSP first 1000000x 0.8650 half 1000000x 0.9073 last 1000000x 1.0594 non 1000000x 0.7417 all 2x 0.9579