This iPython notebook contains the examples detailed in my post 🚀 100 Times Faster Natural Language Processing in Python.
To run the notebook, you will first need to:
pip install cython
pip install spacy
python -m spacy download en
Cython then has to be activated in the notebook as follows:
%load_ext Cython
In this simple example we have a large set of rectangles that we store as a list of Python objects, e.g. instances of a Rectangle class. The main job of our module is to iterate over this list in order to count how many rectangles have an area larger than a specific threshold.
Our Python module is quite simple and looks like this (see also here: https://gist.github.com/thomwolf/0709b5a72cf3620cd00d94791213d38e):
from random import random
class Rectangle:
    """Rectangle described only by its two side lengths, ``w`` and ``h``."""

    def __init__(self, w, h):
        """Store the two side lengths exactly as given."""
        self.w, self.h = w, h

    def area(self):
        """Return the product ``w * h``."""
        first_side, second_side = self.w, self.h
        return first_side * second_side
def check_rectangles_py(rectangles, threshold):
    """Count the items whose ``area()`` is strictly greater than ``threshold``.

    Args:
        rectangles: Iterable of objects exposing an ``area()`` method.
        threshold: Value each area is compared against.

    Returns:
        The number of items for which ``area() > threshold`` holds
        (``0`` for an empty iterable).
    """
    return sum(1 for shape in rectangles if shape.area() > threshold)
def main_rectangles_slow(n_rectangles=10000000, threshold=0.25):
    """Build random rectangles, count the large ones in pure Python, print the count.

    Args:
        n_rectangles: How many ``Rectangle`` objects to create. Defaults to
            the 10,000,000 used in the benchmark.
        threshold: Area threshold handed to ``check_rectangles_py``.
            Defaults to the benchmark's 0.25.

    Returns:
        None. The count is printed, not returned.
    """
    # Both side lengths come from random(), i.e. floats in [0.0, 1.0).
    rectangles = [Rectangle(random(), random()) for _ in range(n_rectangles)]
    n_out = check_rectangles_py(rectangles, threshold=threshold)
    print(n_out)
%%time
# Let's run it:
main_rectangles_slow()
4034940 CPU times: user 13.3 s, sys: 1.48 s, total: 14.8 s Wall time: 14.8 s
The check_rectangles_py
function which loops over a large number of Python objects is our bottleneck!
Let's write it in Cython.
We indicate the cell is a Cython cell by using the %%cython
magic command. When the cell is run, the Cython code will be written to a temporary file, compiled and reimported into the IPython space. The Cython code thus has to be self-contained.
%%cython
from cymem.cymem cimport Pool
from random import random
# C-level struct holding the two side lengths of a rectangle; the area used
# by check_rectangles_cy is the product w * h.
cdef struct Rectangle:
    float w
    float h
# Count the rectangles in a C array whose area (w * h) is strictly greater
# than `threshold`.
#   rectangles   -- pointer to the first element of a C array of Rectangle
#   n_rectangles -- number of elements to read from that array
#   threshold    -- value each area is compared against
# Returns the count as a C int.
cdef int check_rectangles_cy(Rectangle* rectangles, int n_rectangles, float threshold):
    cdef int n_out = 0
    # C arrays contain no size information => we need to state it explicitly
    for rectangle in rectangles[:n_rectangles]:
        if rectangle.w * rectangle.h > threshold:
            n_out += 1
    return n_out
# Fill a C array with 10,000,000 random rectangles, count those whose area
# exceeds 0.25 with check_rectangles_cy, and print the count.
def main_rectangles_fast():
    cdef int n_rectangles = 10000000
    cdef float threshold = 0.25
    # The array below is allocated through `mem`.
    # NOTE(review): cymem documents that a Pool frees its allocations when the
    # Pool object itself is deallocated -- confirm against the cymem docs.
    cdef Pool mem = Pool()
    cdef Rectangle* rectangles = <Rectangle*>mem.alloc(n_rectangles, sizeof(Rectangle))
    for i in range(n_rectangles):
        # random() is the Python-level function imported at the top of the
        # cell; its result is stored into a C float field.
        rectangles[i].w = random()
        rectangles[i].h = random()
    n_out = check_rectangles_cy(rectangles, n_rectangles, threshold)
    print(n_out)
%%time
main_rectangles_fast()
4036387 CPU times: user 676 ms, sys: 40.8 ms, total: 717 ms Wall time: 715 ms
In this simple case we are about 20 times faster in Cython.
The ratio of improvement depends a lot on the specific syntax of the Python program.
While the speed in Cython is rather predictable once your code makes use only of C level objects (it is usually directly the fastest possible speed), the speed of Python can vary a lot depending on how your program is written and how much overhead the interpreter will add.
How can you be sure your Cython program makes use only of C level structures?
Use the -a
or --annotate
flag in the %%cython
magic command to display a code analysis with the line accessing and using Python objects highlighted in yellow.
Here is how the code analysis of the previous program looks:
%%cython -a
from cymem.cymem cimport Pool
from random import random
cdef struct Rectangle:  # emitted as a plain C struct with two float fields
    float w  # one side length (the area is computed as w * h)
    float h  # the other side length
cdef int check_rectangles_cy(Rectangle* rectangles, int n_rectangles, float threshold):
    cdef int n_out = 0  # typed as a C int, so the counter is not a Python object
    # C arrays contain no size information => we need to state it explicitly
    for rectangle in rectangles[:n_rectangles]:  # compiled to a plain C pointer loop
        if rectangle.w * rectangle.h > threshold:
            n_out += 1  # plain C integer increment
    return n_out
cpdef main_rectangles_fast():  # cpdef: a C function plus a Python-callable wrapper
    cdef int n_rectangles = 10000000
    cdef float threshold = 0.25
    cdef Pool mem = Pool()  # the C array below is allocated through this pool
    cdef Rectangle* rectangles = <Rectangle*>mem.alloc(n_rectangles, sizeof(Rectangle))
    for i in range(n_rectangles):  # i is inferred as a C int
        rectangles[i].w = random()  # Python call; result converted to a C float
        rectangles[i].h = random()  # Python call; result converted to a C float
    n_out = check_rectangles_cy(rectangles, n_rectangles, threshold)  # direct C call
    print(n_out)  # n_out is converted to a Python int for print
Generated by Cython 0.28.3
Yellow lines hint at Python interaction.
Click on a line that starts with a "+
" to see the C code that Cython generated for it.
01: from cymem.cymem cimport Pool
+02: from random import random
__pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_INCREF(__pyx_n_s_random); __Pyx_GIVEREF(__pyx_n_s_random); PyList_SET_ITEM(__pyx_t_1, 0, __pyx_n_s_random); __pyx_t_2 = __Pyx_Import(__pyx_n_s_random, __pyx_t_1, 0); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_random); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_random, __pyx_t_1) < 0) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
03:
+04: cdef struct Rectangle:
struct __pyx_t_46_cython_magic_8305ca5d7d676d0e8a3d2abadd94b0ce_Rectangle { float w; float h; };
05: float w
06: float h
07:
+08: cdef int check_rectangles_cy(Rectangle* rectangles, int n_rectangles, float threshold):
static int __pyx_f_46_cython_magic_8305ca5d7d676d0e8a3d2abadd94b0ce_check_rectangles_cy(struct __pyx_t_46_cython_magic_8305ca5d7d676d0e8a3d2abadd94b0ce_Rectangle *__pyx_v_rectangles, int __pyx_v_n_rectangles, float __pyx_v_threshold) { int __pyx_v_n_out; struct __pyx_t_46_cython_magic_8305ca5d7d676d0e8a3d2abadd94b0ce_Rectangle __pyx_v_rectangle; int __pyx_r; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("check_rectangles_cy", 0); /* … */ /* function exit code */ __pyx_L0:; __Pyx_RefNannyFinishContext(); return __pyx_r; }
+09: cdef int n_out = 0
__pyx_v_n_out = 0;
10: # C arrays contain no size information => we need to state it explicitly
+11: for rectangle in rectangles[:n_rectangles]:
__pyx_t_2 = (__pyx_v_rectangles + __pyx_v_n_rectangles); for (__pyx_t_3 = __pyx_v_rectangles; __pyx_t_3 < __pyx_t_2; __pyx_t_3++) { __pyx_t_1 = __pyx_t_3; __pyx_v_rectangle = (__pyx_t_1[0]);
+12: if rectangle.w * rectangle.h > threshold:
__pyx_t_4 = (((__pyx_v_rectangle.w * __pyx_v_rectangle.h) > __pyx_v_threshold) != 0); if (__pyx_t_4) { /* … */ } }
+13: n_out += 1
__pyx_v_n_out = (__pyx_v_n_out + 1);
+14: return n_out
__pyx_r = __pyx_v_n_out; goto __pyx_L0;
15:
+16: cpdef main_rectangles_fast():
static PyObject *__pyx_pw_46_cython_magic_8305ca5d7d676d0e8a3d2abadd94b0ce_1main_rectangles_fast(PyObject *__pyx_self, CYTHON_UNUSED PyObject *unused); /*proto*/ static PyObject *__pyx_f_46_cython_magic_8305ca5d7d676d0e8a3d2abadd94b0ce_main_rectangles_fast(CYTHON_UNUSED int __pyx_skip_dispatch) { int __pyx_v_n_rectangles; float __pyx_v_threshold; struct __pyx_obj_5cymem_5cymem_Pool *__pyx_v_mem = 0; struct __pyx_t_46_cython_magic_8305ca5d7d676d0e8a3d2abadd94b0ce_Rectangle *__pyx_v_rectangles; int __pyx_v_i; int __pyx_v_n_out; PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("main_rectangles_fast", 0); /* … */ /* function exit code */ __pyx_r = Py_None; __Pyx_INCREF(Py_None); goto __pyx_L0; __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_1); __Pyx_XDECREF(__pyx_t_6); __Pyx_XDECREF(__pyx_t_7); __Pyx_AddTraceback("_cython_magic_8305ca5d7d676d0e8a3d2abadd94b0ce.main_rectangles_fast", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = 0; __pyx_L0:; __Pyx_XDECREF((PyObject *)__pyx_v_mem); __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); return __pyx_r; } /* Python wrapper */ static PyObject *__pyx_pw_46_cython_magic_8305ca5d7d676d0e8a3d2abadd94b0ce_1main_rectangles_fast(PyObject *__pyx_self, CYTHON_UNUSED PyObject *unused); /*proto*/ static PyObject *__pyx_pw_46_cython_magic_8305ca5d7d676d0e8a3d2abadd94b0ce_1main_rectangles_fast(PyObject *__pyx_self, CYTHON_UNUSED PyObject *unused) { PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("main_rectangles_fast (wrapper)", 0); __pyx_r = __pyx_pf_46_cython_magic_8305ca5d7d676d0e8a3d2abadd94b0ce_main_rectangles_fast(__pyx_self); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } static PyObject *__pyx_pf_46_cython_magic_8305ca5d7d676d0e8a3d2abadd94b0ce_main_rectangles_fast(CYTHON_UNUSED PyObject *__pyx_self) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("main_rectangles_fast", 0); __Pyx_XDECREF(__pyx_r); 
__pyx_t_1 = __pyx_f_46_cython_magic_8305ca5d7d676d0e8a3d2abadd94b0ce_main_rectangles_fast(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 16, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; goto __pyx_L0; /* function exit code */ __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_1); __Pyx_AddTraceback("_cython_magic_8305ca5d7d676d0e8a3d2abadd94b0ce.main_rectangles_fast", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); return __pyx_r; }
+17: cdef int n_rectangles = 10000000
__pyx_v_n_rectangles = 0x989680;
+18: cdef float threshold = 0.25
__pyx_v_threshold = 0.25;
+19: cdef Pool mem = Pool()
__pyx_t_1 = __Pyx_PyObject_CallNoArg(((PyObject *)__pyx_ptype_5cymem_5cymem_Pool)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_v_mem = ((struct __pyx_obj_5cymem_5cymem_Pool *)__pyx_t_1); __pyx_t_1 = 0;
+20: cdef Rectangle* rectangles = <Rectangle*>mem.alloc(n_rectangles, sizeof(Rectangle))
__pyx_t_2 = ((struct __pyx_vtabstruct_5cymem_5cymem_Pool *)__pyx_v_mem->__pyx_vtab)->alloc(__pyx_v_mem, __pyx_v_n_rectangles, (sizeof(struct __pyx_t_46_cython_magic_8305ca5d7d676d0e8a3d2abadd94b0ce_Rectangle))); if (unlikely(__pyx_t_2 == ((void *)NULL))) __PYX_ERR(0, 20, __pyx_L1_error)
__pyx_v_rectangles = ((struct __pyx_t_46_cython_magic_8305ca5d7d676d0e8a3d2abadd94b0ce_Rectangle *)__pyx_t_2);
+21: for i in range(n_rectangles):
__pyx_t_3 = __pyx_v_n_rectangles; __pyx_t_4 = __pyx_t_3; for (__pyx_t_5 = 0; __pyx_t_5 < __pyx_t_4; __pyx_t_5+=1) { __pyx_v_i = __pyx_t_5;
+22: rectangles[i].w = random()
__pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_random); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 22, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_6); __pyx_t_7 = NULL; if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_6))) { __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_6); if (likely(__pyx_t_7)) { PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_6); __Pyx_INCREF(__pyx_t_7); __Pyx_INCREF(function); __Pyx_DECREF_SET(__pyx_t_6, function); } } if (__pyx_t_7) { __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_6, __pyx_t_7); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 22, __pyx_L1_error) __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; } else { __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_6); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 22, __pyx_L1_error) } __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; __pyx_t_8 = __pyx_PyFloat_AsFloat(__pyx_t_1); if (unlikely((__pyx_t_8 == (float)-1) && PyErr_Occurred())) __PYX_ERR(0, 22, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; (__pyx_v_rectangles[__pyx_v_i]).w = __pyx_t_8;
+23: rectangles[i].h = random()
__pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_random); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 23, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_6); __pyx_t_7 = NULL; if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_6))) { __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_6); if (likely(__pyx_t_7)) { PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_6); __Pyx_INCREF(__pyx_t_7); __Pyx_INCREF(function); __Pyx_DECREF_SET(__pyx_t_6, function); } } if (__pyx_t_7) { __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_6, __pyx_t_7); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 23, __pyx_L1_error) __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; } else { __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_6); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 23, __pyx_L1_error) } __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; __pyx_t_8 = __pyx_PyFloat_AsFloat(__pyx_t_1); if (unlikely((__pyx_t_8 == (float)-1) && PyErr_Occurred())) __PYX_ERR(0, 23, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; (__pyx_v_rectangles[__pyx_v_i]).h = __pyx_t_8; }
+24: n_out = check_rectangles_cy(rectangles, n_rectangles, threshold)
__pyx_v_n_out = __pyx_f_46_cython_magic_8305ca5d7d676d0e8a3d2abadd94b0ce_check_rectangles_cy(__pyx_v_rectangles, __pyx_v_n_rectangles, __pyx_v_threshold);
+25: print(n_out)
__pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_n_out); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 25, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_6 = __Pyx_PyObject_CallOneArg(__pyx_builtin_print, __pyx_t_1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 25, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_6); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
The important element here is that lines 11 to 13 are not highlighted which means they will be running at the fastest possible speed.
It's ok to have yellow lines in the main_rectangles_fast
function as this function will only be called once when we execute our program anyway. The yellow lines 22 and 23 are initialization lines that we could avoid by using a C level random function like stdlib rand()
but we didn't want to clutter this example.
Now here is an example of the previous cython program not optimized (with Python objects in the loop):
%%cython -a
from cymem.cymem cimport Pool
from random import random
cdef struct Rectangle:  # emitted as a plain C struct with two float fields
    float w  # one side length (the area is computed as w * h)
    float h  # the other side length
cdef int check_rectangles_cy(Rectangle* rectangles, int n_rectangles, float threshold):
    # ========== MODIFICATION ===========
    # We changed the following line from `cdef int n_out = 0` to
    n_out = 0
    # n_out is not defined as an `int` anymore and is now thus a regular Python object
    # ===================================
    for rectangle in rectangles[:n_rectangles]:  # the iteration itself is still a plain C loop
        if rectangle.w * rectangle.h > threshold:
            n_out += 1  # Python-object addition (with error check) on every hit
    return n_out  # converted from a Python object back to the declared C int
cpdef main_rectangles_not_so_fast():  # cpdef: a C function plus a Python-callable wrapper
    cdef int n_rectangles = 10000000
    cdef float threshold = 0.25
    cdef Pool mem = Pool()  # the C array below is allocated through this pool
    cdef Rectangle* rectangles = <Rectangle*>mem.alloc(n_rectangles, sizeof(Rectangle))
    for i in range(n_rectangles):  # i is inferred as a C int
        rectangles[i].w = random()  # Python call; result converted to a C float
        rectangles[i].h = random()  # Python call; result converted to a C float
    n_out = check_rectangles_cy(rectangles, n_rectangles, threshold)  # direct C call
    print(n_out)  # n_out is converted to a Python int for print
Generated by Cython 0.28.3
Yellow lines hint at Python interaction.
Click on a line that starts with a "+
" to see the C code that Cython generated for it.
01: from cymem.cymem cimport Pool
+02: from random import random
__pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __Pyx_INCREF(__pyx_n_s_random); __Pyx_GIVEREF(__pyx_n_s_random); PyList_SET_ITEM(__pyx_t_1, 0, __pyx_n_s_random); __pyx_t_2 = __Pyx_Import(__pyx_n_s_random, __pyx_t_1, 0); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_random); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); if (PyDict_SetItem(__pyx_d, __pyx_n_s_random, __pyx_t_1) < 0) __PYX_ERR(0, 2, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
03:
+04: cdef struct Rectangle:
struct __pyx_t_46_cython_magic_dbc2c06a712520185e24b7d477e83d8b_Rectangle { float w; float h; };
05: float w
06: float h
07:
+08: cdef int check_rectangles_cy(Rectangle* rectangles, int n_rectangles, float threshold):
static int __pyx_f_46_cython_magic_dbc2c06a712520185e24b7d477e83d8b_check_rectangles_cy(struct __pyx_t_46_cython_magic_dbc2c06a712520185e24b7d477e83d8b_Rectangle *__pyx_v_rectangles, int __pyx_v_n_rectangles, float __pyx_v_threshold) { PyObject *__pyx_v_n_out = NULL; struct __pyx_t_46_cython_magic_dbc2c06a712520185e24b7d477e83d8b_Rectangle __pyx_v_rectangle; int __pyx_r; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("check_rectangles_cy", 0); /* … */ /* function exit code */ __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_5); __Pyx_WriteUnraisable("_cython_magic_dbc2c06a712520185e24b7d477e83d8b.check_rectangles_cy", __pyx_clineno, __pyx_lineno, __pyx_filename, 1, 0); __pyx_r = 0; __pyx_L0:; __Pyx_XDECREF(__pyx_v_n_out); __Pyx_RefNannyFinishContext(); return __pyx_r; }
09: # ========== MODIFICATION ===========
10: # We changed the following line from `cdef int n_out = 0` to
+11: n_out = 0
__Pyx_INCREF(__pyx_int_0);
__pyx_v_n_out = __pyx_int_0;
12: # n_out is not defined as an `int` anymore and is now thus a regular Python object
13: # ===================================
+14: for rectangle in rectangles[:n_rectangles]:
__pyx_t_2 = (__pyx_v_rectangles + __pyx_v_n_rectangles); for (__pyx_t_3 = __pyx_v_rectangles; __pyx_t_3 < __pyx_t_2; __pyx_t_3++) { __pyx_t_1 = __pyx_t_3; __pyx_v_rectangle = (__pyx_t_1[0]);
+15: if rectangle.w * rectangle.h > threshold:
__pyx_t_4 = (((__pyx_v_rectangle.w * __pyx_v_rectangle.h) > __pyx_v_threshold) != 0); if (__pyx_t_4) { /* … */ } }
+16: n_out += 1
__pyx_t_5 = __Pyx_PyInt_AddObjC(__pyx_v_n_out, __pyx_int_1, 1, 1); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 16, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF_SET(__pyx_v_n_out, __pyx_t_5); __pyx_t_5 = 0;
+17: return n_out
__pyx_t_6 = __Pyx_PyInt_As_int(__pyx_v_n_out); if (unlikely((__pyx_t_6 == (int)-1) && PyErr_Occurred())) __PYX_ERR(0, 17, __pyx_L1_error) __pyx_r = __pyx_t_6; goto __pyx_L0;
18:
+19: cpdef main_rectangles_not_so_fast():
static PyObject *__pyx_pw_46_cython_magic_dbc2c06a712520185e24b7d477e83d8b_1main_rectangles_not_so_fast(PyObject *__pyx_self, CYTHON_UNUSED PyObject *unused); /*proto*/ static PyObject *__pyx_f_46_cython_magic_dbc2c06a712520185e24b7d477e83d8b_main_rectangles_not_so_fast(CYTHON_UNUSED int __pyx_skip_dispatch) { int __pyx_v_n_rectangles; float __pyx_v_threshold; struct __pyx_obj_5cymem_5cymem_Pool *__pyx_v_mem = 0; struct __pyx_t_46_cython_magic_dbc2c06a712520185e24b7d477e83d8b_Rectangle *__pyx_v_rectangles; int __pyx_v_i; int __pyx_v_n_out; PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("main_rectangles_not_so_fast", 0); /* … */ /* function exit code */ __pyx_r = Py_None; __Pyx_INCREF(Py_None); goto __pyx_L0; __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_1); __Pyx_XDECREF(__pyx_t_6); __Pyx_XDECREF(__pyx_t_7); __Pyx_AddTraceback("_cython_magic_dbc2c06a712520185e24b7d477e83d8b.main_rectangles_not_so_fast", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = 0; __pyx_L0:; __Pyx_XDECREF((PyObject *)__pyx_v_mem); __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); return __pyx_r; } /* Python wrapper */ static PyObject *__pyx_pw_46_cython_magic_dbc2c06a712520185e24b7d477e83d8b_1main_rectangles_not_so_fast(PyObject *__pyx_self, CYTHON_UNUSED PyObject *unused); /*proto*/ static PyObject *__pyx_pw_46_cython_magic_dbc2c06a712520185e24b7d477e83d8b_1main_rectangles_not_so_fast(PyObject *__pyx_self, CYTHON_UNUSED PyObject *unused) { PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("main_rectangles_not_so_fast (wrapper)", 0); __pyx_r = __pyx_pf_46_cython_magic_dbc2c06a712520185e24b7d477e83d8b_main_rectangles_not_so_fast(__pyx_self); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } static PyObject *__pyx_pf_46_cython_magic_dbc2c06a712520185e24b7d477e83d8b_main_rectangles_not_so_fast(CYTHON_UNUSED PyObject *__pyx_self) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations 
__Pyx_RefNannySetupContext("main_rectangles_not_so_fast", 0); __Pyx_XDECREF(__pyx_r); __pyx_t_1 = __pyx_f_46_cython_magic_dbc2c06a712520185e24b7d477e83d8b_main_rectangles_not_so_fast(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 19, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; goto __pyx_L0; /* function exit code */ __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_1); __Pyx_AddTraceback("_cython_magic_dbc2c06a712520185e24b7d477e83d8b.main_rectangles_not_so_fast", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; __Pyx_XGIVEREF(__pyx_r); __Pyx_RefNannyFinishContext(); return __pyx_r; }
+20: cdef int n_rectangles = 10000000
__pyx_v_n_rectangles = 0x989680;
+21: cdef float threshold = 0.25
__pyx_v_threshold = 0.25;
+22: cdef Pool mem = Pool()
__pyx_t_1 = __Pyx_PyObject_CallNoArg(((PyObject *)__pyx_ptype_5cymem_5cymem_Pool)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 22, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_v_mem = ((struct __pyx_obj_5cymem_5cymem_Pool *)__pyx_t_1); __pyx_t_1 = 0;
+23: cdef Rectangle* rectangles = <Rectangle*>mem.alloc(n_rectangles, sizeof(Rectangle))
__pyx_t_2 = ((struct __pyx_vtabstruct_5cymem_5cymem_Pool *)__pyx_v_mem->__pyx_vtab)->alloc(__pyx_v_mem, __pyx_v_n_rectangles, (sizeof(struct __pyx_t_46_cython_magic_dbc2c06a712520185e24b7d477e83d8b_Rectangle))); if (unlikely(__pyx_t_2 == ((void *)NULL))) __PYX_ERR(0, 23, __pyx_L1_error)
__pyx_v_rectangles = ((struct __pyx_t_46_cython_magic_dbc2c06a712520185e24b7d477e83d8b_Rectangle *)__pyx_t_2);
+24: for i in range(n_rectangles):
__pyx_t_3 = __pyx_v_n_rectangles; __pyx_t_4 = __pyx_t_3; for (__pyx_t_5 = 0; __pyx_t_5 < __pyx_t_4; __pyx_t_5+=1) { __pyx_v_i = __pyx_t_5;
+25: rectangles[i].w = random()
__pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_random); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 25, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_6); __pyx_t_7 = NULL; if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_6))) { __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_6); if (likely(__pyx_t_7)) { PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_6); __Pyx_INCREF(__pyx_t_7); __Pyx_INCREF(function); __Pyx_DECREF_SET(__pyx_t_6, function); } } if (__pyx_t_7) { __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_6, __pyx_t_7); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 25, __pyx_L1_error) __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; } else { __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_6); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 25, __pyx_L1_error) } __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; __pyx_t_8 = __pyx_PyFloat_AsFloat(__pyx_t_1); if (unlikely((__pyx_t_8 == (float)-1) && PyErr_Occurred())) __PYX_ERR(0, 25, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; (__pyx_v_rectangles[__pyx_v_i]).w = __pyx_t_8;
+26: rectangles[i].h = random()
__pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_random); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 26, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_6); __pyx_t_7 = NULL; if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_6))) { __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_6); if (likely(__pyx_t_7)) { PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_6); __Pyx_INCREF(__pyx_t_7); __Pyx_INCREF(function); __Pyx_DECREF_SET(__pyx_t_6, function); } } if (__pyx_t_7) { __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_t_6, __pyx_t_7); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 26, __pyx_L1_error) __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; } else { __pyx_t_1 = __Pyx_PyObject_CallNoArg(__pyx_t_6); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 26, __pyx_L1_error) } __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; __pyx_t_8 = __pyx_PyFloat_AsFloat(__pyx_t_1); if (unlikely((__pyx_t_8 == (float)-1) && PyErr_Occurred())) __PYX_ERR(0, 26, __pyx_L1_error) __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; (__pyx_v_rectangles[__pyx_v_i]).h = __pyx_t_8; }
+27: n_out = check_rectangles_cy(rectangles, n_rectangles, threshold)
__pyx_v_n_out = __pyx_f_46_cython_magic_dbc2c06a712520185e24b7d477e83d8b_check_rectangles_cy(__pyx_v_rectangles, __pyx_v_n_rectangles, __pyx_v_threshold);
+28: print(n_out)
__pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_n_out); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 28, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); __pyx_t_6 = __Pyx_PyObject_CallOneArg(__pyx_builtin_print, __pyx_t_1); if (unlikely(!__pyx_t_6)) __PYX_ERR(0, 28, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_6); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
We can see that line 16 in the loop of check_rectangles_cy
is highlighted, indicating that the Cython compiler had to add some Python API overhead.
Our blog post goes into some detail about the way spaCy can help you speed up your code by using Cython for NLP.
Here is a short summary of the post:
Generally speaking: unless you know what you are doing, avoid using C strings where possible and use Python string objects instead.
StringStore
TokenC
and LexemeC
The StringStore
object is accessible from everywhere in spaCy and every object (see on the left), for example as nlp.vocab.strings
, doc.vocab.strings
or span.doc.vocab.strings
:
Here is now a simple example of NLP processing in Cython.
First let's build a list of big documents and parse them using spaCy (this takes a few minutes):
import urllib.request
import spacy
# Build a dataset of 10 parsed documents extracted from the Wikitext-2 dataset
with urllib.request.urlopen('https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/valid.txt') as response:
    text = response.read()
nlp = spacy.load('en')
# Slice and decode the 800,000-byte prefix once instead of once per document:
# it is identical for every document in the list.
# NOTE: slicing bytes before decoding can split a multi-byte UTF-8 character
# and make decode() raise; the recorded run below shows this prefix decoding
# cleanly.
sample_text = text[:800000].decode('utf8')
# The same sample is parsed 10 times to obtain a larger benchmark dataset.
doc_list = [nlp(sample_text) for _ in range(10)]
We have about 1.7 million tokens ("words") in our dataset:
sum(len(doc) for doc in doc_list)
1716200
We want to perform some NLP task on this dataset.
For example, we would like to count the number of times the word "run" is used as a noun in the dataset (i.e. tagged with a "NN" Part-Of-Speech tag).
A Python loop to do that is short and straightforward:
def slow_loop(doc_list, word, tag):
    """Count tokens whose lower-cased text equals ``word`` and whose tag equals ``tag``.

    Args:
        doc_list: Iterable of documents; each document is an iterable of
            tokens exposing ``lower_`` and ``tag_`` attributes.
        word: Value compared against each token's ``lower_``.
        tag: Value compared against each token's ``tag_``.

    Returns:
        The number of matching tokens across all documents.
    """
    matches = 0
    for parsed_doc in doc_list:
        # One tally per document, accumulated into the overall total.
        matches += sum(
            1
            for token in parsed_doc
            if token.lower_ == word and token.tag_ == tag
        )
    return matches
def main_nlp_slow(doc_list):
    """Print how many tokens in ``doc_list`` are the word 'run' tagged 'NN'.

    The counting is delegated to ``slow_loop``; nothing is returned.
    """
    print(slow_loop(doc_list, 'run', 'NN'))
%%time
# But it's also quite slow
main_nlp_slow(doc_list)
90 CPU times: user 1.3 s, sys: 60.2 ms, total: 1.36 s Wall time: 1.41 s
On my laptop this code takes about 1.4 seconds to get the answer.
Let's try to speed this up with spaCy and a bit of Cython.
First, we have to think about the data structure. We will need a C level array for the dataset, with pointers to each document's TokenC array. We'll also need to convert the strings we use for testing to 64-bit hashes: "run" and "NN". When all the data required for our processing is in C level objects, we can then iterate at full C speed over the dataset.
Here is how this example can be written in Cython with spaCy:
%%cython -+
import numpy # Sometime we have a fail to import numpy compilation error if we don't import numpy
from cymem.cymem cimport Pool
from spacy.tokens.doc cimport Doc
from spacy.typedefs cimport hash_t
from spacy.structs cimport TokenC
# One entry per document: a pointer to a TokenC array plus the number of
# tokens to read from it (main_nlp_fast fills these from Doc.c / Doc.length).
cdef struct DocElement:
    TokenC* c
    int length
# Count the tokens, across all documents, whose lexeme `lower` hash equals
# `word` and whose `tag` hash equals `tag`.
#   docs   -- pointer to the first element of a C array of DocElement
#   n_docs -- number of elements to read from that array
#   word   -- hash compared against each token's lex.lower
#   tag    -- hash compared against each token's tag
# Returns the count as a C int.
cdef int fast_loop(DocElement* docs, int n_docs, hash_t word, hash_t tag):
    cdef int doc_idx, tok_idx
    cdef int n_tokens
    cdef int n_out = 0
    cdef TokenC* tokens
    for doc_idx in range(n_docs):
        tokens = docs[doc_idx].c
        # C arrays carry no size information, so the stored length bounds the loop
        n_tokens = docs[doc_idx].length
        for tok_idx in range(n_tokens):
            if tokens[tok_idx].lex.lower == word and tokens[tok_idx].tag == tag:
                n_out += 1
    return n_out
# Count and print how many tokens in `doc_list` are the word 'run' tagged 'NN',
# using the C-level fast_loop.
#   doc_list -- sequence of spaCy Doc objects (len() and iteration are used)
# Prints the count; prints 0 for an empty `doc_list`. Returns None.
cpdef main_nlp_fast(doc_list):
    cdef int i, n_out, n_docs = len(doc_list)
    cdef hash_t word_hash, tag_hash
    cdef Pool mem
    cdef DocElement* docs
    cdef Doc doc
    # With no documents, `doc` would still be None when its vocab is read
    # below, so answer directly -- the pure Python loop also yields 0 here.
    if n_docs == 0:
        print(0)
        return
    # The array is allocated through `mem`.
    # NOTE(review): cymem documents that a Pool frees its allocations when the
    # Pool object itself is deallocated -- confirm against the cymem docs.
    mem = Pool()
    docs = <DocElement*>mem.alloc(n_docs, sizeof(DocElement))
    for i, doc in enumerate(doc_list): # Populate our database structure
        docs[i].c = doc.c
        docs[i].length = doc.length  # `doc` is already typed as Doc, no cast needed
    # The strings are turned into hashes through the vocab of the last
    # document visited by the loop above.
    # NOTE(review): presumably every Doc in doc_list shares one vocab -- confirm.
    word_hash = doc.vocab.strings.add('run')
    tag_hash = doc.vocab.strings.add('NN')
    n_out = fast_loop(docs, n_docs, word_hash, tag_hash)
    print(n_out)
%%time
main_nlp_fast(doc_list)
90 CPU times: user 20.6 ms, sys: 405 µs, total: 21 ms Wall time: 21 ms
The code is a bit longer because we have to declare and populate the C structures in main_nlp_fast
before calling our Cython function.
But it is also a lot faster! In my Jupyter notebook, this cython code takes about 21 milliseconds to run on my laptop which is about 60 times faster than our previous pure Python loop.
The absolute speed is also impressive for a module written in an interactive Jupyter Notebook and which can interface natively with other Python modules and functions: scanning ~1.7 million words in 21 ms means we are processing a whopping 80 million words per second.