Notebook

We want to group by multi-valued columns. For example, take this dataset:

In [1]:

import pandas as pd
data = pd.DataFrame([
        [['a', 'b'], ['x', 'y'], 3],
        [['a', 'c'], ['x', 'z'], 2],
    ], columns=['left', 'right', 'value'])
data

Out[1]:

	left	right	value
0	[a, b]	[x, y]	3
1	[a, c]	[x, z]	2

left and right have arrays as values.

We want the equivalent of a .groupby([left, right])['value'].sum() that would return:

left right value
   a     x     5    # (a, x) occurs in the first AND second row
   a     y     3    # from first row
   b     x     3    # from first row
   b     y     3    # from first row
   a     z     2    # from second row
   c     x     2    # from second row
   c     z     2    # from second row

Approach 1: Python¶

Let's write this as a series of for loops first:

In [2]:

def python_kpartite(data, left, right, value):
    result = {}
    for index, row in data.iterrows():            # Loop through every row
        for a in row[left]:                       # Loop through all values in the left cell
            for b in row[right]:                  # ... and the right cell
                if (a, b) in result:              # Add value to the results counter
                    result[a, b] += row[value]
                else:
                    result[a, b] = row[value]
    return result                                 # Return the dict

python_kpartite(data, 'left', 'right', 'value')

Out[2]:

{('a', 'x'): 5L,
 ('a', 'y'): 3L,
 ('a', 'z'): 2L,
 ('b', 'x'): 3L,
 ('b', 'y'): 3L,
 ('c', 'x'): 2L,
 ('c', 'z'): 2L}

This result is correct. But it is fairly slow even for small data.

In [5]:

smalldata = pd.concat([data] * 10000)
middata = pd.concat([data] * 100000)

In [6]:

%timeit python_kpartite(smalldata, 'left', 'right', 'value')

1 loops, best of 3: 2.46 s per loop

Python with Pandas¶

In [7]:

result = {}
def python_inner(row):
    for a in row['left']:
        for b in row['right']:
            if (a, b) in result:
                result[a, b] += row['value']
            else:
                result[a, b] = row['value']

In [8]:

%timeit smalldata.apply(python_inner, axis=1)

1 loops, best of 3: 1.44 s per loop

Approach 2: Cython¶

In [9]:

%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython

Naive Cython is not faster¶

In [10]:

%%cython

# This is a one-time compilation! If you re-run it, result will still be cached.
# Change this comment to recompile.
result = {}
def cython_inner(row):
    for a in row['left']:
        for b in row['right']:
            result[a, b] = result.get((a, b), 0) + row['value']

In [11]:

%timeit smalldata.apply(cython_inner, axis=1)

1 loops, best of 3: 1.42 s per loop

Approach 3: Cython on NumPy¶

See http://nbviewer.ipython.org/gist/tillahoffmann/296501acea231cbdf5e7 for profiling

In [10]:

#Load Robert Kern's line profiler
%load_ext line_profiler
import line_profiler

#Set compiler directives (cf. http://docs.cython.org/src/reference/compilation.html)
from Cython.Compiler.Options import directive_defaults

directive_defaults['linetrace'] = True
directive_defaults['binding'] = True

In [12]:

%%cython -a -f --compile-args=-DCYTHON_TRACE=1

cimport numpy as np

def cython_kpartite(np.ndarray npdata):
    cdef int index
    cdef np.ndarray row
    cdef list left, right
    cdef double value
    cdef str a, b
    cdef tuple key
    cdef dict result = {}
    for index in range(npdata.shape[0]):
        row = npdata[index]
        left = row[0]
        right = row[1]
        value = row[2]
        for a in left:
            for b in right:
                key = a, b
                result[key] = result.get(key, 0)  + value
    return result

Out[12]:

Cython: _cython_magic_61780ddc8d4e3e1667c3753a9a52fcd2.pyx

Generated by Cython 0.23.4

Yellow lines hint at Python interaction.
Click on a line that starts with a "+" to see the C code that Cython generated for it.

01:

+02: cimport numpy as np

  __Pyx_TraceLine(2,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
  __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;

03:

+04: def cython_kpartite(np.ndarray npdata):

/* Python wrapper */
static PyObject *__pyx_pw_46_cython_magic_61780ddc8d4e3e1667c3753a9a52fcd2_1cython_kpartite(PyObject *__pyx_self, PyObject *__pyx_v_npdata); /*proto*/
static PyMethodDef __pyx_mdef_46_cython_magic_61780ddc8d4e3e1667c3753a9a52fcd2_1cython_kpartite = {"cython_kpartite", (PyCFunction)__pyx_pw_46_cython_magic_61780ddc8d4e3e1667c3753a9a52fcd2_1cython_kpartite, METH_O, 0};
static PyObject *__pyx_pw_46_cython_magic_61780ddc8d4e3e1667c3753a9a52fcd2_1cython_kpartite(PyObject *__pyx_self, PyObject *__pyx_v_npdata) {
  PyObject *__pyx_r = 0;
  __Pyx_RefNannyDeclarations
  __Pyx_RefNannySetupContext("cython_kpartite (wrapper)", 0);
  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_npdata), __pyx_ptype_5numpy_ndarray, 1, "npdata", 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __pyx_r = __pyx_pf_46_cython_magic_61780ddc8d4e3e1667c3753a9a52fcd2_cython_kpartite(__pyx_self, ((PyArrayObject *)__pyx_v_npdata));
  CYTHON_UNUSED int __pyx_lineno = 0;
  CYTHON_UNUSED const char *__pyx_filename = NULL;
  CYTHON_UNUSED int __pyx_clineno = 0;

  /* function exit code */
  goto __pyx_L0;
  __pyx_L1_error:;
  __pyx_r = NULL;
  __pyx_L0:;
  __Pyx_RefNannyFinishContext();
  return __pyx_r;
}

static PyObject *__pyx_pf_46_cython_magic_61780ddc8d4e3e1667c3753a9a52fcd2_cython_kpartite(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_npdata) {
  int __pyx_v_index;
  PyArrayObject *__pyx_v_row = 0;
  PyObject *__pyx_v_left = 0;
  PyObject *__pyx_v_right = 0;
  double __pyx_v_value;
  PyObject *__pyx_v_a = 0;
  PyObject *__pyx_v_b = 0;
  PyObject *__pyx_v_key = 0;
  PyObject *__pyx_v_result = 0;
  PyObject *__pyx_r = NULL;
  __Pyx_TraceDeclarations
  __Pyx_TraceFrameInit(__pyx_codeobj_)
  __Pyx_RefNannyDeclarations
  __Pyx_RefNannySetupContext("cython_kpartite", 0);
  __Pyx_TraceCall("cython_kpartite", __pyx_f[0], 4, 0, {__pyx_filename = __pyx_f[0]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;});
/* … */
  /* function exit code */
  __pyx_L1_error:;
  __Pyx_XDECREF(__pyx_t_1);
  __Pyx_XDECREF(__pyx_t_6);
  __Pyx_XDECREF(__pyx_t_8);
  __Pyx_XDECREF(__pyx_t_9);
  __Pyx_XDECREF(__pyx_t_10);
  __Pyx_AddTraceback("_cython_magic_61780ddc8d4e3e1667c3753a9a52fcd2.cython_kpartite", __pyx_clineno, __pyx_lineno, __pyx_filename);
  __pyx_r = NULL;
  __pyx_L0:;
  __Pyx_XDECREF((PyObject *)__pyx_v_row);
  __Pyx_XDECREF(__pyx_v_left);
  __Pyx_XDECREF(__pyx_v_right);
  __Pyx_XDECREF(__pyx_v_a);
  __Pyx_XDECREF(__pyx_v_b);
  __Pyx_XDECREF(__pyx_v_key);
  __Pyx_XDECREF(__pyx_v_result);
  __Pyx_XGIVEREF(__pyx_r);
  __Pyx_TraceReturn(__pyx_r, 0);
  __Pyx_RefNannyFinishContext();
  return __pyx_r;
}
/* … */
  __pyx_tuple__8 = PyTuple_Pack(10, __pyx_n_s_npdata, __pyx_n_s_index, __pyx_n_s_row, __pyx_n_s_left, __pyx_n_s_right, __pyx_n_s_value, __pyx_n_s_a, __pyx_n_s_b, __pyx_n_s_key, __pyx_n_s_result); if (unlikely(!__pyx_tuple__8)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_tuple__8);
  __Pyx_GIVEREF(__pyx_tuple__8);
/* … */
  __Pyx_TraceLine(4,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
  __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_46_cython_magic_61780ddc8d4e3e1667c3753a9a52fcd2_1cython_kpartite, 0, __pyx_n_s_cython_kpartite, NULL, __pyx_n_s_cython_magic_61780ddc8d4e3e1667, __pyx_d, ((PyObject *)__pyx_codeobj_)); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_cython_kpartite, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 4; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;

 05:     cdef int index

 06:     cdef np.ndarray row

 07:     cdef list left, right

 08:     cdef double value

 09:     cdef str a, b

 10:     cdef tuple key

+11:     cdef dict result = {}

  __Pyx_TraceLine(11,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
  __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_t_1);
  __pyx_v_result = ((PyObject*)__pyx_t_1);
  __pyx_t_1 = 0;

+12:     for index in range(npdata.shape[0]):

  __Pyx_TraceLine(12,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
  __pyx_t_2 = (__pyx_v_npdata->dimensions[0]);
  for (__pyx_t_3 = 0; __pyx_t_3 < __pyx_t_2; __pyx_t_3+=1) {
    __pyx_v_index = __pyx_t_3;

+13:         row = npdata[index]

    __Pyx_TraceLine(13,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
    __pyx_t_1 = __Pyx_GetItemInt(((PyObject *)__pyx_v_npdata), __pyx_v_index, int, 1, __Pyx_PyInt_From_int, 0, 1, 1); if (unlikely(__pyx_t_1 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
    __Pyx_GOTREF(__pyx_t_1);
    if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
    __Pyx_XDECREF_SET(__pyx_v_row, ((PyArrayObject *)__pyx_t_1));
    __pyx_t_1 = 0;

+14:         left = row[0]

    __Pyx_TraceLine(14,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
    __pyx_t_1 = __Pyx_GetItemInt(((PyObject *)__pyx_v_row), 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(__pyx_t_1 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
    __Pyx_GOTREF(__pyx_t_1);
    if (!(likely(PyList_CheckExact(__pyx_t_1))||((__pyx_t_1) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "list", Py_TYPE(__pyx_t_1)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 14; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
    __Pyx_XDECREF_SET(__pyx_v_left, ((PyObject*)__pyx_t_1));
    __pyx_t_1 = 0;

+15:         right = row[1]

    __Pyx_TraceLine(15,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
    __pyx_t_1 = __Pyx_GetItemInt(((PyObject *)__pyx_v_row), 1, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(__pyx_t_1 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
    __Pyx_GOTREF(__pyx_t_1);
    if (!(likely(PyList_CheckExact(__pyx_t_1))||((__pyx_t_1) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "list", Py_TYPE(__pyx_t_1)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
    __Pyx_XDECREF_SET(__pyx_v_right, ((PyObject*)__pyx_t_1));
    __pyx_t_1 = 0;

+16:         value = row[2]

    __Pyx_TraceLine(16,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
    __pyx_t_1 = __Pyx_GetItemInt(((PyObject *)__pyx_v_row), 2, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(__pyx_t_1 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
    __Pyx_GOTREF(__pyx_t_1);
    __pyx_t_4 = __pyx_PyFloat_AsDouble(__pyx_t_1); if (unlikely((__pyx_t_4 == (double)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
    __pyx_v_value = __pyx_t_4;

+17:         for a in left:

    __Pyx_TraceLine(17,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
    if (unlikely(__pyx_v_left == Py_None)) {
      PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable");
      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
    }
    __pyx_t_1 = __pyx_v_left; __Pyx_INCREF(__pyx_t_1); __pyx_t_5 = 0;
    for (;;) {
      if (__pyx_t_5 >= PyList_GET_SIZE(__pyx_t_1)) break;
      #if CYTHON_COMPILING_IN_CPYTHON
      __pyx_t_6 = PyList_GET_ITEM(__pyx_t_1, __pyx_t_5); __Pyx_INCREF(__pyx_t_6); __pyx_t_5++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
      #else
      __pyx_t_6 = PySequence_ITEM(__pyx_t_1, __pyx_t_5); __pyx_t_5++; if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
      __Pyx_GOTREF(__pyx_t_6);
      #endif
      if (!(likely(PyString_CheckExact(__pyx_t_6))||((__pyx_t_6) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "str", Py_TYPE(__pyx_t_6)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
      __Pyx_XDECREF_SET(__pyx_v_a, ((PyObject*)__pyx_t_6));
      __pyx_t_6 = 0;
/* … */
      __Pyx_TraceLine(17,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
    }
    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  }

+18:             for b in right:

      __Pyx_TraceLine(18,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
      if (unlikely(__pyx_v_right == Py_None)) {
        PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable");
        {__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
      }
      __pyx_t_6 = __pyx_v_right; __Pyx_INCREF(__pyx_t_6); __pyx_t_7 = 0;
      for (;;) {
        if (__pyx_t_7 >= PyList_GET_SIZE(__pyx_t_6)) break;
        #if CYTHON_COMPILING_IN_CPYTHON
        __pyx_t_8 = PyList_GET_ITEM(__pyx_t_6, __pyx_t_7); __Pyx_INCREF(__pyx_t_8); __pyx_t_7++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
        #else
        __pyx_t_8 = PySequence_ITEM(__pyx_t_6, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_8)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
        __Pyx_GOTREF(__pyx_t_8);
        #endif
        if (!(likely(PyString_CheckExact(__pyx_t_8))||((__pyx_t_8) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "str", Py_TYPE(__pyx_t_8)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
        __Pyx_XDECREF_SET(__pyx_v_b, ((PyObject*)__pyx_t_8));
        __pyx_t_8 = 0;
/* … */
        __Pyx_TraceLine(18,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
      }
      __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;

+19:                 key = a, b

        __Pyx_TraceLine(19,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
        __pyx_t_8 = PyTuple_New(2); if (unlikely(!__pyx_t_8)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
        __Pyx_GOTREF(__pyx_t_8);
        __Pyx_INCREF(__pyx_v_a);
        __Pyx_GIVEREF(__pyx_v_a);
        PyTuple_SET_ITEM(__pyx_t_8, 0, __pyx_v_a);
        __Pyx_INCREF(__pyx_v_b);
        __Pyx_GIVEREF(__pyx_v_b);
        PyTuple_SET_ITEM(__pyx_t_8, 1, __pyx_v_b);
        __Pyx_XDECREF_SET(__pyx_v_key, ((PyObject*)__pyx_t_8));
        __pyx_t_8 = 0;

+20:                 result[key] = result.get(key, 0)  + value

        __Pyx_TraceLine(20,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
        __pyx_t_8 = __Pyx_PyDict_GetItemDefault(__pyx_v_result, __pyx_v_key, __pyx_int_0); if (unlikely(!__pyx_t_8)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
        __Pyx_GOTREF(__pyx_t_8);
        __pyx_t_9 = PyFloat_FromDouble(__pyx_v_value); if (unlikely(!__pyx_t_9)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
        __Pyx_GOTREF(__pyx_t_9);
        __pyx_t_10 = PyNumber_Add(__pyx_t_8, __pyx_t_9); if (unlikely(!__pyx_t_10)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
        __Pyx_GOTREF(__pyx_t_10);
        __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
        __Pyx_DECREF(__pyx_t_9); __pyx_t_9 = 0;
        if (unlikely(PyDict_SetItem(__pyx_v_result, __pyx_v_key, __pyx_t_10) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
        __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0;

+21:     return result

  __Pyx_TraceLine(21,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
  __Pyx_XDECREF(__pyx_r);
  __Pyx_INCREF(__pyx_v_result);
  __pyx_r = __pyx_v_result;
  goto __pyx_L0;

In [13]:

%timeit cython_kpartite(middata.values)

1 loops, best of 3: 294 ms per loop

Optimisation¶

In [14]:

%%cython -a -f --compile-args=-DCYTHON_TRACE=1

cimport numpy as np

# cimport cython
# @cython.boundscheck(False)
# @cython.wraparound(False)
# @cython.overflowcheck(False)
# @cython.nonecheck(False)
def cython_kpartite2(                           # Split parameters on input (-80ms)
        np.ndarray[list] leftarray,
        np.ndarray[list] rightarray,
        np.ndarray[double] valuearray):
    cdef int index, n = leftarray.shape[0]
    # cdef list left, right
    # cdef double value
    result = {}
    for index in range(n):
        right = rightarray[index]
        value = valuearray[index]
        for a in leftarray[index]:
            for b in right:
                key = a, b                     # TODO: Use a string instead of a tuple (saves -80ms)
                result[key] = result.get(key, 0.0) + value
    return result

Out[14]:

Cython: _cython_magic_16adb21d1da83629b403b2790d247d8a.pyx

Generated by Cython 0.23.4

Yellow lines hint at Python interaction.
Click on a line that starts with a "+" to see the C code that Cython generated for it.

01:

+02: cimport numpy as np

  __Pyx_TraceLine(2,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
  __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 2; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;

03:

 04: # cimport cython

 05: # @cython.boundscheck(False)

 06: # @cython.wraparound(False)

 07: # @cython.overflowcheck(False)

 08: # @cython.nonecheck(False)

+09: def cython_kpartite2(                           # Split parameters on input (-80ms)

/* Python wrapper */
static PyObject *__pyx_pw_46_cython_magic_16adb21d1da83629b403b2790d247d8a_1cython_kpartite2(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
static PyMethodDef __pyx_mdef_46_cython_magic_16adb21d1da83629b403b2790d247d8a_1cython_kpartite2 = {"cython_kpartite2", (PyCFunction)__pyx_pw_46_cython_magic_16adb21d1da83629b403b2790d247d8a_1cython_kpartite2, METH_VARARGS|METH_KEYWORDS, 0};
static PyObject *__pyx_pw_46_cython_magic_16adb21d1da83629b403b2790d247d8a_1cython_kpartite2(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
  PyArrayObject *__pyx_v_leftarray = 0;
  PyArrayObject *__pyx_v_rightarray = 0;
  PyArrayObject *__pyx_v_valuearray = 0;
  PyObject *__pyx_r = 0;
  __Pyx_RefNannyDeclarations
  __Pyx_RefNannySetupContext("cython_kpartite2 (wrapper)", 0);
  {
    static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_leftarray,&__pyx_n_s_rightarray,&__pyx_n_s_valuearray,0};
    PyObject* values[3] = {0,0,0};
    if (unlikely(__pyx_kwds)) {
      Py_ssize_t kw_args;
      const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
      switch (pos_args) {
        case  3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
        case  2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
        case  1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
        case  0: break;
        default: goto __pyx_L5_argtuple_error;
      }
      kw_args = PyDict_Size(__pyx_kwds);
      switch (pos_args) {
        case  0:
        if (likely((values[0] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_leftarray)) != 0)) kw_args--;
        else goto __pyx_L5_argtuple_error;
        case  1:
        if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_rightarray)) != 0)) kw_args--;
        else {
          __Pyx_RaiseArgtupleInvalid("cython_kpartite2", 1, 3, 3, 1); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L3_error;}
        }
        case  2:
        if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_valuearray)) != 0)) kw_args--;
        else {
          __Pyx_RaiseArgtupleInvalid("cython_kpartite2", 1, 3, 3, 2); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L3_error;}
        }
      }
      if (unlikely(kw_args > 0)) {
        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "cython_kpartite2") < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L3_error;}
      }
    } else if (PyTuple_GET_SIZE(__pyx_args) != 3) {
      goto __pyx_L5_argtuple_error;
    } else {
      values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
      values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
      values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
    }
    __pyx_v_leftarray = ((PyArrayObject *)values[0]);
    __pyx_v_rightarray = ((PyArrayObject *)values[1]);
    __pyx_v_valuearray = ((PyArrayObject *)values[2]);
  }
  goto __pyx_L4_argument_unpacking_done;
  __pyx_L5_argtuple_error:;
  __Pyx_RaiseArgtupleInvalid("cython_kpartite2", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L3_error;}
  __pyx_L3_error:;
  __Pyx_AddTraceback("_cython_magic_16adb21d1da83629b403b2790d247d8a.cython_kpartite2", __pyx_clineno, __pyx_lineno, __pyx_filename);
  __Pyx_RefNannyFinishContext();
  return NULL;
  __pyx_L4_argument_unpacking_done:;
  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_leftarray), __pyx_ptype_5numpy_ndarray, 1, "leftarray", 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 10; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_rightarray), __pyx_ptype_5numpy_ndarray, 1, "rightarray", 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_valuearray), __pyx_ptype_5numpy_ndarray, 1, "valuearray", 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 12; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __pyx_r = __pyx_pf_46_cython_magic_16adb21d1da83629b403b2790d247d8a_cython_kpartite2(__pyx_self, __pyx_v_leftarray, __pyx_v_rightarray, __pyx_v_valuearray);
  int __pyx_lineno = 0;
  const char *__pyx_filename = NULL;
  int __pyx_clineno = 0;

  /* function exit code */
  goto __pyx_L0;
  __pyx_L1_error:;
  __pyx_r = NULL;
  __pyx_L0:;
  __Pyx_RefNannyFinishContext();
  return __pyx_r;
}

static PyObject *__pyx_pf_46_cython_magic_16adb21d1da83629b403b2790d247d8a_cython_kpartite2(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_leftarray, PyArrayObject *__pyx_v_rightarray, PyArrayObject *__pyx_v_valuearray) {
  int __pyx_v_index;
  int __pyx_v_n;
  PyObject *__pyx_v_result = NULL;
  PyObject *__pyx_v_right = NULL;
  PyObject *__pyx_v_value = NULL;
  PyObject *__pyx_v_a = NULL;
  PyObject *__pyx_v_b = NULL;
  PyObject *__pyx_v_key = NULL;
  __Pyx_LocalBuf_ND __pyx_pybuffernd_leftarray;
  __Pyx_Buffer __pyx_pybuffer_leftarray;
  __Pyx_LocalBuf_ND __pyx_pybuffernd_rightarray;
  __Pyx_Buffer __pyx_pybuffer_rightarray;
  __Pyx_LocalBuf_ND __pyx_pybuffernd_valuearray;
  __Pyx_Buffer __pyx_pybuffer_valuearray;
  PyObject *__pyx_r = NULL;
  __Pyx_TraceDeclarations
  __Pyx_TraceFrameInit(__pyx_codeobj_)
  __Pyx_RefNannyDeclarations
  __Pyx_RefNannySetupContext("cython_kpartite2", 0);
  __Pyx_TraceCall("cython_kpartite2", __pyx_f[0], 9, 0, {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;});
  __pyx_pybuffer_leftarray.pybuffer.buf = NULL;
  __pyx_pybuffer_leftarray.refcount = 0;
  __pyx_pybuffernd_leftarray.data = NULL;
  __pyx_pybuffernd_leftarray.rcbuffer = &__pyx_pybuffer_leftarray;
  __pyx_pybuffer_rightarray.pybuffer.buf = NULL;
  __pyx_pybuffer_rightarray.refcount = 0;
  __pyx_pybuffernd_rightarray.data = NULL;
  __pyx_pybuffernd_rightarray.rcbuffer = &__pyx_pybuffer_rightarray;
  __pyx_pybuffer_valuearray.pybuffer.buf = NULL;
  __pyx_pybuffer_valuearray.refcount = 0;
  __pyx_pybuffernd_valuearray.data = NULL;
  __pyx_pybuffernd_valuearray.rcbuffer = &__pyx_pybuffer_valuearray;
  {
    __Pyx_BufFmt_StackElem __pyx_stack[1];
    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_leftarray.rcbuffer->pybuffer, (PyObject*)__pyx_v_leftarray, &__Pyx_TypeInfo_object, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  }
  __pyx_pybuffernd_leftarray.diminfo[0].strides = __pyx_pybuffernd_leftarray.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_leftarray.diminfo[0].shape = __pyx_pybuffernd_leftarray.rcbuffer->pybuffer.shape[0];
  {
    __Pyx_BufFmt_StackElem __pyx_stack[1];
    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_rightarray.rcbuffer->pybuffer, (PyObject*)__pyx_v_rightarray, &__Pyx_TypeInfo_object, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  }
  __pyx_pybuffernd_rightarray.diminfo[0].strides = __pyx_pybuffernd_rightarray.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_rightarray.diminfo[0].shape = __pyx_pybuffernd_rightarray.rcbuffer->pybuffer.shape[0];
  {
    __Pyx_BufFmt_StackElem __pyx_stack[1];
    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_valuearray.rcbuffer->pybuffer, (PyObject*)__pyx_v_valuearray, &__Pyx_TypeInfo_double, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  }
  __pyx_pybuffernd_valuearray.diminfo[0].strides = __pyx_pybuffernd_valuearray.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_valuearray.diminfo[0].shape = __pyx_pybuffernd_valuearray.rcbuffer->pybuffer.shape[0];
/* … */
  /* function exit code */
  __pyx_L1_error:;
  __Pyx_XDECREF(__pyx_t_1);
  __Pyx_XDECREF(__pyx_t_8);
  __Pyx_XDECREF(__pyx_t_12);
  __Pyx_XDECREF(__pyx_t_13);
  { PyObject *__pyx_type, *__pyx_value, *__pyx_tb;
    __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb);
    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_leftarray.rcbuffer->pybuffer);
    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_rightarray.rcbuffer->pybuffer);
    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_valuearray.rcbuffer->pybuffer);
  __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);}
  __Pyx_AddTraceback("_cython_magic_16adb21d1da83629b403b2790d247d8a.cython_kpartite2", __pyx_clineno, __pyx_lineno, __pyx_filename);
  __pyx_r = NULL;
  goto __pyx_L2;
  __pyx_L0:;
  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_leftarray.rcbuffer->pybuffer);
  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_rightarray.rcbuffer->pybuffer);
  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_valuearray.rcbuffer->pybuffer);
  __pyx_L2:;
  __Pyx_XDECREF(__pyx_v_result);
  __Pyx_XDECREF(__pyx_v_right);
  __Pyx_XDECREF(__pyx_v_value);
  __Pyx_XDECREF(__pyx_v_a);
  __Pyx_XDECREF(__pyx_v_b);
  __Pyx_XDECREF(__pyx_v_key);
  __Pyx_XGIVEREF(__pyx_r);
  __Pyx_TraceReturn(__pyx_r, 0);
  __Pyx_RefNannyFinishContext();
  return __pyx_r;
}
/* … */
  __pyx_tuple__8 = PyTuple_Pack(11, __pyx_n_s_leftarray, __pyx_n_s_rightarray, __pyx_n_s_valuearray, __pyx_n_s_index, __pyx_n_s_n, __pyx_n_s_result, __pyx_n_s_right, __pyx_n_s_value, __pyx_n_s_a, __pyx_n_s_b, __pyx_n_s_key); if (unlikely(!__pyx_tuple__8)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_tuple__8);
  __Pyx_GIVEREF(__pyx_tuple__8);
/* … */
  __Pyx_TraceLine(9,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
  __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_46_cython_magic_16adb21d1da83629b403b2790d247d8a_1cython_kpartite2, 0, __pyx_n_s_cython_kpartite2, NULL, __pyx_n_s_cython_magic_16adb21d1da83629b4, __pyx_d, ((PyObject *)__pyx_codeobj_)); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_t_1);
  if (PyDict_SetItem(__pyx_d, __pyx_n_s_cython_kpartite2, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;

 10:         np.ndarray[list] leftarray,

 11:         np.ndarray[list] rightarray,

 12:         np.ndarray[double] valuearray):

+13:     cdef int index, n = leftarray.shape[0]

  __Pyx_TraceLine(13,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
  __pyx_v_n = (__pyx_v_leftarray->dimensions[0]);

 14:     # cdef list left, right

 15:     # cdef double value

+16:     result = {}

  __Pyx_TraceLine(16,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
  __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
  __Pyx_GOTREF(__pyx_t_1);
  __pyx_v_result = ((PyObject*)__pyx_t_1);
  __pyx_t_1 = 0;

+17:     for index in range(n):

  __Pyx_TraceLine(17,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
  __pyx_t_2 = __pyx_v_n;
  for (__pyx_t_3 = 0; __pyx_t_3 < __pyx_t_2; __pyx_t_3+=1) {
    __pyx_v_index = __pyx_t_3;

+18:         right = rightarray[index]

    __Pyx_TraceLine(18,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
    __pyx_t_4 = __pyx_v_index;
    __pyx_t_5 = -1;
    if (__pyx_t_4 < 0) {
      __pyx_t_4 += __pyx_pybuffernd_rightarray.diminfo[0].shape;
      if (unlikely(__pyx_t_4 < 0)) __pyx_t_5 = 0;
    } else if (unlikely(__pyx_t_4 >= __pyx_pybuffernd_rightarray.diminfo[0].shape)) __pyx_t_5 = 0;
    if (unlikely(__pyx_t_5 != -1)) {
      __Pyx_RaiseBufferIndexError(__pyx_t_5);
      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
    }
    __pyx_t_1 = (PyObject *) *__Pyx_BufPtrStrided1d(PyObject **, __pyx_pybuffernd_rightarray.rcbuffer->pybuffer.buf, __pyx_t_4, __pyx_pybuffernd_rightarray.diminfo[0].strides);
    __Pyx_INCREF((PyObject*)__pyx_t_1);
    __Pyx_XDECREF_SET(__pyx_v_right, __pyx_t_1);
    __pyx_t_1 = 0;

+19:         value = valuearray[index]

    __Pyx_TraceLine(19,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
    __pyx_t_6 = __pyx_v_index;
    __pyx_t_5 = -1;
    if (__pyx_t_6 < 0) {
      __pyx_t_6 += __pyx_pybuffernd_valuearray.diminfo[0].shape;
      if (unlikely(__pyx_t_6 < 0)) __pyx_t_5 = 0;
    } else if (unlikely(__pyx_t_6 >= __pyx_pybuffernd_valuearray.diminfo[0].shape)) __pyx_t_5 = 0;
    if (unlikely(__pyx_t_5 != -1)) {
      __Pyx_RaiseBufferIndexError(__pyx_t_5);
      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
    }
    __pyx_t_1 = PyFloat_FromDouble((*__Pyx_BufPtrStrided1d(double *, __pyx_pybuffernd_valuearray.rcbuffer->pybuffer.buf, __pyx_t_6, __pyx_pybuffernd_valuearray.diminfo[0].strides))); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
    __Pyx_GOTREF(__pyx_t_1);
    __Pyx_XDECREF_SET(__pyx_v_value, __pyx_t_1);
    __pyx_t_1 = 0;

+20:         for a in leftarray[index]:

    __Pyx_TraceLine(20,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
    __pyx_t_7 = __pyx_v_index;
    __pyx_t_5 = -1;
    if (__pyx_t_7 < 0) {
      __pyx_t_7 += __pyx_pybuffernd_leftarray.diminfo[0].shape;
      if (unlikely(__pyx_t_7 < 0)) __pyx_t_5 = 0;
    } else if (unlikely(__pyx_t_7 >= __pyx_pybuffernd_leftarray.diminfo[0].shape)) __pyx_t_5 = 0;
    if (unlikely(__pyx_t_5 != -1)) {
      __Pyx_RaiseBufferIndexError(__pyx_t_5);
      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
    }
    __pyx_t_1 = (PyObject *) *__Pyx_BufPtrStrided1d(PyObject **, __pyx_pybuffernd_leftarray.rcbuffer->pybuffer.buf, __pyx_t_7, __pyx_pybuffernd_leftarray.diminfo[0].strides);
    __Pyx_INCREF((PyObject*)__pyx_t_1);
    if (unlikely(__pyx_t_1 == Py_None)) {
      PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable");
      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
    }
    __pyx_t_8 = __pyx_t_1; __Pyx_INCREF(__pyx_t_8); __pyx_t_9 = 0;
    __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
    for (;;) {
      if (__pyx_t_9 >= PyList_GET_SIZE(__pyx_t_8)) break;
      #if CYTHON_COMPILING_IN_CPYTHON
      __pyx_t_1 = PyList_GET_ITEM(__pyx_t_8, __pyx_t_9); __Pyx_INCREF(__pyx_t_1); __pyx_t_9++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
      #else
      __pyx_t_1 = PySequence_ITEM(__pyx_t_8, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
      __Pyx_GOTREF(__pyx_t_1);
      #endif
      __Pyx_XDECREF_SET(__pyx_v_a, __pyx_t_1);
      __pyx_t_1 = 0;
/* … */
      __Pyx_TraceLine(20,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
    }
    __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
  }

+21:             for b in right:

      __Pyx_TraceLine(21,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
      if (likely(PyList_CheckExact(__pyx_v_right)) || PyTuple_CheckExact(__pyx_v_right)) {
        __pyx_t_1 = __pyx_v_right; __Pyx_INCREF(__pyx_t_1); __pyx_t_10 = 0;
        __pyx_t_11 = NULL;
      } else {
        __pyx_t_10 = -1; __pyx_t_1 = PyObject_GetIter(__pyx_v_right); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
        __Pyx_GOTREF(__pyx_t_1);
        __pyx_t_11 = Py_TYPE(__pyx_t_1)->tp_iternext; if (unlikely(!__pyx_t_11)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
      }
      for (;;) {
        if (likely(!__pyx_t_11)) {
          if (likely(PyList_CheckExact(__pyx_t_1))) {
            if (__pyx_t_10 >= PyList_GET_SIZE(__pyx_t_1)) break;
            #if CYTHON_COMPILING_IN_CPYTHON
            __pyx_t_12 = PyList_GET_ITEM(__pyx_t_1, __pyx_t_10); __Pyx_INCREF(__pyx_t_12); __pyx_t_10++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
            #else
            __pyx_t_12 = PySequence_ITEM(__pyx_t_1, __pyx_t_10); __pyx_t_10++; if (unlikely(!__pyx_t_12)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
            __Pyx_GOTREF(__pyx_t_12);
            #endif
          } else {
            if (__pyx_t_10 >= PyTuple_GET_SIZE(__pyx_t_1)) break;
            #if CYTHON_COMPILING_IN_CPYTHON
            __pyx_t_12 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_10); __Pyx_INCREF(__pyx_t_12); __pyx_t_10++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
            #else
            __pyx_t_12 = PySequence_ITEM(__pyx_t_1, __pyx_t_10); __pyx_t_10++; if (unlikely(!__pyx_t_12)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
            __Pyx_GOTREF(__pyx_t_12);
            #endif
          }
        } else {
          __pyx_t_12 = __pyx_t_11(__pyx_t_1);
          if (unlikely(!__pyx_t_12)) {
            PyObject* exc_type = PyErr_Occurred();
            if (exc_type) {
              if (likely(exc_type == PyExc_StopIteration || PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear();
              else {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
            }
            break;
          }
          __Pyx_GOTREF(__pyx_t_12);
        }
        __Pyx_XDECREF_SET(__pyx_v_b, __pyx_t_12);
        __pyx_t_12 = 0;
/* … */
        __Pyx_TraceLine(21,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
      }
      __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;

+22:                 key = a, b                     # TODO: Use a string instead of a tuple (saves -80ms)

        __Pyx_TraceLine(22,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
        __pyx_t_12 = PyTuple_New(2); if (unlikely(!__pyx_t_12)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
        __Pyx_GOTREF(__pyx_t_12);
        __Pyx_INCREF(__pyx_v_a);
        __Pyx_GIVEREF(__pyx_v_a);
        PyTuple_SET_ITEM(__pyx_t_12, 0, __pyx_v_a);
        __Pyx_INCREF(__pyx_v_b);
        __Pyx_GIVEREF(__pyx_v_b);
        PyTuple_SET_ITEM(__pyx_t_12, 1, __pyx_v_b);
        __Pyx_XDECREF_SET(__pyx_v_key, ((PyObject*)__pyx_t_12));
        __pyx_t_12 = 0;

+23:                 result[key] = result.get(key, 0.0) + value

        __Pyx_TraceLine(23,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
        __pyx_t_12 = __Pyx_PyDict_GetItemDefault(__pyx_v_result, __pyx_v_key, __pyx_float_0_0); if (unlikely(!__pyx_t_12)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
        __Pyx_GOTREF(__pyx_t_12);
        __pyx_t_13 = PyNumber_Add(__pyx_t_12, __pyx_v_value); if (unlikely(!__pyx_t_13)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
        __Pyx_GOTREF(__pyx_t_13);
        __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0;
        if (unlikely(PyDict_SetItem(__pyx_v_result, __pyx_v_key, __pyx_t_13) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
        __Pyx_DECREF(__pyx_t_13); __pyx_t_13 = 0;

+24:     return result

  __Pyx_TraceLine(24,0,{__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;})
  __Pyx_XDECREF(__pyx_r);
  __Pyx_INCREF(__pyx_v_result);
  __pyx_r = __pyx_v_result;
  goto __pyx_L0;

In [15]:

%timeit cython_kpartite2(middata.left.values, middata.right.values, middata.value.astype(float).values)

1 loops, best of 3: 219 ms per loop

In [16]:

profile = line_profiler.LineProfiler(cython_kpartite2)
profile.runcall(cython_kpartite2, middata.left.values, middata.right.values, middata.value.astype(float).values)
profile.print_stats()

Timer unit: 4.66511e-07 s

Total time: 1.90599 s
File: D:\cygwin64\home\Anand\.ipython\cython\_cython_magic_16adb21d1da83629b403b2790d247d8a.pyx
Function: cython_kpartite2 at line 9

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     9                                           def cython_kpartite2(                           # Split parameters on input (-80ms)
    10                                                   np.ndarray[list] leftarray,
    11                                                   np.ndarray[list] rightarray,
    12                                                   np.ndarray[double] valuearray):
    13         1            4      4.0      0.0      cdef int index, n = leftarray.shape[0]
    14                                               # cdef list left, right
    15                                               # cdef double value
    16         1            1      1.0      0.0      result = {}
    17         1            1      1.0      0.0      for index in range(n):
    18    200000       187425      0.9      4.6          right = rightarray[index]
    19    200000       188811      0.9      4.6          value = valuearray[index]
    20    600000       570909      1.0     14.0          for a in leftarray[index]:
    21   1200000      1132739      0.9     27.7              for b in right:
    22    800000       794150      1.0     19.4                  key = a, b                     # TODO: Use a string instead of a tuple (saves -80ms)
    23    800000      1211571      1.5     29.7                  result[key] = result.get(key, 0.0) + value
    24         1            8      8.0      0.0      return result

In [33]:

%%cython

cimport numpy as np

def bipartite(np.ndarray[list] leftarray, np.ndarray[list] rightarray):
    cdef int index, n = leftarray.shape[0]
    cdef dict result = {}
    cdef list right
    cdef str a, b
    for index in range(n):
        right = rightarray[index]
        for a in leftarray[index]:
            for b in right:
                key = a, b
                result[key] = result.get(key, 0) + 1
    return result

In [34]:

%timeit bipartite(middata.left.values, middata.right.values)

1 loops, best of 3: 196 ms per loop

Generalised K-Partite¶

Let's generalise the problem. If we want multi-partite counts across multiple columns, and each could be multi-valued, like this:

ColA    ColB    ColC
A1,A2   B1      C1,C2,C3
A1,A3   -       C1
A1      B1,B2   C2,C3

In that case, the link pairs we want are:

ColA    ColB    ColC       =>    Internal links               External links
A1,A2   B1      C1,C2,C3         A1-A2, C1-C2, C1-C3, C2-C3   A1-B1, A1-C1, A1-C2, A1-C3, A2-B1, A2-C1, A2-C2, A2-C3
A1,A3   -       C1               A1-A3                        A1-C1, A3-C1
A1      B1,B2   C2,C3            B1-B2, C2-C3                 A1-B1, A1-B2, A1-C2, A1-C3, B1-C2, B1-C3, B2-C2, B2-C3

The psuedo-code for this is:

for row in data:                                              # Handle every row independently
    for column1 in columns:                                   # Loop through every column
        for value1 in row[column1]:                           # Loop through each item in the list
            for column2 in columns:                           # Loop through every column again
                if column2 < column1:                         # But ignore columns already examined
                    continue
               for value2 in row[column2]:                    # Loop through each item in the list
                   count[value1, value2] += 1                 # Increment the counter for every pair of values

Let's get this right in Python first.

In [1]:

import pandas as pd
from collections import Counter

In [2]:

data = pd.DataFrame({
        'ColA': [['A1', 'A2'], ['A1','A3'], ['A1']],
        'ColB': [['B1'], [], ['B1', 'B2']],
        'ColC': [['C1', 'C2', 'C3'], ['C1'], ['C2', 'C3']]
    })

In [3]:

def python_kpartite(data):
    columns = range(len(data.columns))
    count = Counter()
    for row in data.values:
        for column1 in columns:
            for value1 in row[column1]:
                for column2 in columns:
                    if column2 < column1:
                        continue
                    for value2 in row[column2]:
                        count[value1, value2] += 1
    return pd.Series(count)

How fast is it?¶

Do we even need to optimise it? Let's take a look at it's speed on 30,000 rows:

In [4]:

%timeit python_kpartite(pd.concat([data]*10000))

1 loops, best of 3: 1.34 s per loop

Let's look at some real-life data

In [5]:

airline = pd.read_csv('airline-patents.csv')

# Convert all string columns to arrays -- split by ";" and deduplicate non-empty values
# columns = [col for col in airline.columns if airline[col].dtype.kind == 'O']
columns = ['PA', 'IPC', 'INV', 'keywords']
for col in columns:
    airline[col] = airline[col].str.split(';').apply(lambda v: [x for x in set(v) if x])

In [6]:

airline[columns].head()

Out[6]:

	PA	IPC	INV	keywords
0	[MESSERSCHMITT BOELKOW BLOHM]	[G10K, B64C, F04D]	[BSCHORR OSKAR]	[angular range, control system, fuselage skin,...
1	[MESSERSCHMITT BOELKOW BLOHM]	[F02C, B64D]	[JUNGCLAUS GUENTHER, KROHN ERNST OTTO]	[air outlet port, air passage, missile body, a...
2	[CRESSWELL ARNOLD W, VOSS WALDEMAR E, ERNSTING...	[A62B, B64D]	[CRESSWELL ARNOLD W, VOSS WALDEMAR E, ERNSTING...	[air supply duct, sectional area, non-return v...
3	[Unassigned]	[C08L]	[CHEN JOHN Y]	[gelatinous compositions, soft gelatinous elas...
4	[Unassigned]	[B01D, C10G]	[CHEN JOHN Y]	[gel rigidity region, metal foil, synthetic re...

In [7]:

len(airline)

Out[7]:

In [8]:

%timeit python_kpartite(airline[columns].head(1000))

1 loops, best of 3: 770 ms per loop

We'd like this to be at much faster.

Naive Cython¶

Let's just copy the same algorithm into Cython and see how fast it is.

In [9]:

%load_ext Cython

In [10]:

%%cython

from collections import Counter

def cython_kpartite(data):
    columns = range(len(data.columns))
    count = Counter()
    for row in data.values:
        for column1 in columns:
            for value1 in row[column1]:
                for column2 in columns:
                    if column2 < column1:
                        continue
                    for value2 in row[column2]:
                        count[value1, value2] += 1
    return count

In [11]:

%timeit cython_kpartite(airline[columns].head(1000))

1 loops, best of 3: 292 ms per loop

It's the counting that's slow¶

Actually, let's run the same algorithm but instead of incrementing a collections.Counter, lets increment an integer and see how much time it takes.

In [12]:

%%cython

def cython_kpartite(data):
    columns = range(len(data.columns))
    count = 0
    for row in data.values:
        for column1 in columns:
            for value1 in row[column1]:
                for column2 in columns:
                    if column2 < column1:
                        continue
                    for value2 in row[column2]:
                        count += 1
    return count

In [13]:

%timeit cython_kpartite(airline[columns].head(1000))

100 loops, best of 3: 9.72 ms per loop

That's much faster. Not bad, but not good enough. Plus, we need to see how to optimise collections.Counter.

We're creating too many links!¶

Let's see how many links this will create:

In [14]:

print '{:,d}'.format(cython_kpartite(airline[columns].head(1000)))
print '{:,d}'.format(cython_kpartite(airline[columns].head(20000)))

303,550
3,722,267

That's a HUGE number links.

Obviously, we're not interested in the long tail. We'd be primarily interested in the connections between the largest nodes. So we need to rework our approach:

Find the N largest nodes (where N can be specified by the user)
Find the links between the largest nodes

Find the N largest nodes¶

Since every value is associated with every other value in a row, calculating this is simple:

In [15]:

%%cython -f --compile-args=-DCYTHON_TRACE=1

from collections import defaultdict

def cython_count(data):
    cdef int ncols = len(data.columns)              # ncols = number of columns in data
    cdef int rowsize                                # rowsize = number of nodes in each row
    count = defaultdict(int)                        # count[node] = number of links the node has
    for row in data.values:                         # We loop through each row
        rowsize = 0
        for col in range(ncols):                    # ... and calculate the number of nodes in this row
            rowsize += len(row[col])
        for col in range(ncols):                    # In every column...
            values = row[col]
            for index in range(len(values)):        # ... for every node...
                count[col, values[index]] += rowsize     # ... we add the number of nodes in the row to its count
    
    return count                                    # Count is now a dict that has the number of links against each node

In [16]:

%timeit cython_count(airline[columns].head(100000))

10 loops, best of 3: 131 ms per loop

Incidentally, even here, the bulk of the time goes into defaultdict(int). We need a faster counter.

Link the top N nodes¶

Let's just create the links for the top nodes. We can skip anything that's not in that list.

Here's what we could do in every row:

Get the (column, value) for every column and value
Map it to a node index
Increment all combinations of every node index pairs

In [17]:

%%cython

from collections import defaultdict

def cython_kpartite(data, count):
    cdef int ncols = len(data.columns)              # ncols = number of columns in data

    # For the top nodes, let's get the (column, value) -> node-index
    nodes = count.keys()
    index = {key: i for i, key in enumerate(count)}
    
    # Here's a slow implementation of the logic
    links = defaultdict(int)
    for row in data.values:
        indices = []
        for col in range(ncols):
            values = row[col]
            for i in range(len(values)):
                key = (col, values[i])
                if key in index:
                    indices.append(index[key])
        n = len(indices)
        for i in range(n):
            for j in range(i + 1, n):
                links[i, j] += 1
    return nodes, links

In [18]:

def top(data, top):
    return dict(sorted(data.items(), key=lambda v: -v[1])[:top])

count = cython_count(airline[columns].head(10000))
cython_kpartite(airline[columns].head(10000), top(count, 10))

Out[18]:

([(0, 'AIRBUS'),
  (0, 'BOEING CO'),
  (1, 'B64D'),
  (1, 'G05D'),
  (0, 'Unassigned'),
  (1, 'B64F'),
  (1, 'B64G'),
  (1, 'G06F'),
  (1, 'G01C'),
  (1, 'B64C')],
 defaultdict(int,
             {(0, 1): 3565,
              (0, 2): 799,
              (0, 3): 137,
              (0, 4): 10,
              (0, 5): 2,
              (1, 2): 799,
              (1, 3): 137,
              (1, 4): 10,
              (1, 5): 2,
              (2, 3): 137,
              (2, 4): 10,
              (2, 5): 2,
              (3, 4): 10,
              (3, 5): 2,
              (4, 5): 2}))

Memory views¶

Functionally this works fine. Now let's optimise it. Memory views let us access NumPy arrays' memory directly. Let's do that.

In [19]:

# Unoptimised time:
count = cython_count(airline[columns])
%timeit -r1 cython_kpartite(airline[columns].head(100000), top(count, 500))

1 loops, best of 1: 204 ms per loop

In [20]:

%%cython -f --compile-args=-DCYTHON_TRACE=1

from collections import defaultdict
import numpy as np
cimport numpy as np

def cython_kpartite_memoryview(data, count):
    cdef int ncols = len(data.columns)              # ncols = number of columns in data

    # For the top nodes, let's get the (column, value) -> node-index
    nodes = count.keys()
    index = {key: i for i, key in enumerate(count)}

    cdef int nodecount = len(nodes)
    cdef np.ndarray links = np.zeros([nodecount, nodecount], dtype=np.int)
    cdef int [:, :] link_view = links
    cdef int i, j, n
    for row in data.values:
        indices = []
        for col in range(ncols):
            values = row[col]
            for i in range(len(values)):
                key = (col, values[i])
                if key in index:
                    indices.append(index[key])
        n = len(indices)
        for i in range(n):
            for j in range(i + 1, n):
                link_view[i, j] += 1
    return count, np.argwhere(links > 0)

In [21]:

%timeit cython_kpartite_memoryview(airline[columns].head(100000), top(count, 500))

10 loops, best of 3: 116 ms per loop

That's a good improvement. Now let's put it together:

In [22]:

%%cython -f --compile-args=-DCYTHON_TRACE=1

from collections import defaultdict
import numpy as np
cimport numpy as np

def cython_kpartite_full(data, top=None):
    cdef int ncols = len(data.columns)              # ncols = number of columns in data
    cdef int rowsize                                # rowsize = number of nodes in each row
    count = defaultdict(int)                        # count[node] = number of links the node has
    for row in data.values:                         # We loop through each row
        rowsize = 0
        for col in range(ncols):                    # ... and calculate the number of nodes in this row
            rowsize += len(row[col])
        for col in range(ncols):                    # In every column...
            values = row[col]
            for index in range(len(values)):        # ... for every node...
                count[col, values[index]] += rowsize     # ... we add the number of nodes in the row to its count
    
    if callable(top):
        count = top(count)
    
    # For the top nodes, let's get the (column, value) -> node-index
    nodes = count.keys()
    index = {key: i for i, key in enumerate(count)}

    cdef int nodecount = len(nodes)
    cdef np.ndarray links = np.zeros([nodecount, nodecount], dtype=np.int)
    cdef int [:, :] link_view = links
    cdef int i, j, n
    for row in data.values:
        indices = []
        for col in range(ncols):
            values = row[col]
            for i in range(len(values)):
                key = (col, values[i])
                if key in index:
                    indices.append(index[key])
        n = len(indices)
        for i in range(n):
            for j in range(i + 1, n):
                link_view[indices[i], indices[j]] += 1
    return count, np.argwhere(links > 0)

warning: D:\cygwin64\home\Anand\.ipython\cython\_cython_magic_e58ac54db67eda082396b010cb77d72e.pyx:41:33: Index should be typed for more efficient access

In [23]:

%timeit -r1 cython_kpartite_full(airline[columns], top=lambda count: top(count, 500))

1 loops, best of 1: 284 ms per loop

Further optimisation¶

This is the optimised code:

In [24]:

%%cython

import numpy as np
cimport numpy as np

cimport cython
@cython.boundscheck(False)
@cython.wraparound(False)
def cython_kpartite_optimised(data, subset=None):
    # This block takes ~2s
    cdef int ncols = len(data.columns)              # ncols = number of columns in data
    cdef int rowsize                                # rowsize = number of nodes in each row
    cdef int i, j, n
    count = {}
    for row in data.values:                         # We loop through each row
        rowsize = 0
        for i in range(ncols):                      # ... and calculate the number of values in this row
            rowsize += len(row[i])
        for i in range(ncols):                      # Add this rowsize against each value in each column
            subcount = count.setdefault(i, {})
            for val in row[i]:
                subcount[val] = rowsize + subcount.get(val, 0)
                
    # Now, count[col_number][val] has the number of nodes it is associated with.
    # Use count[col_number, val] instead for convenience
    count = {(i, val): n for i in count for val, n in count[i].iteritems()}
                
    # If a subset() function is provided, restrict the number of nodes to that many.
    # Otherwise, restrict to top 500 nodes
    if callable(subset):
        count = subset(count)
    else:
        subset = subset if isinstance(subset, int) else 500
        count = dict(sorted(count.items(), key=lambda v: -v[1])[:subset])
    
    # index[col_number, node] = index of the corresponding node
    nodes = count.keys()
    index = {key: i for i, key in enumerate(count)}

    # Calculate the links array
    cdef int nodecount = len(nodes)
    cdef np.ndarray links = np.zeros([nodecount, nodecount], dtype=np.int)
    cdef int [:, :] link_view = links
    cdef int indices[50000]                         # Limit number of distinct values possible in a row
    for row in data.values:
        # This block takes 1.3s
        n = 0
        for i in range(ncols):
            for val in row[i]:
                key = (i, val)
                if key in index:
                    indices[n] = index[key]
                    n += 1
        # This block takes ~0.9s
        for i in range(n):
            for j in range(i + 1, n):
                link_view[indices[i], indices[j]] += 1
    return count, np.argwhere(links > 0)

In [25]:

%timeit -r1 cython_kpartite_optimised(airline[columns], subset=lambda count: top(count, 500))

1 loops, best of 1: 235 ms per loop

In [26]:

count, links = cython_kpartite_optimised(airline[columns], subset=lambda count: top(count, 500))

In [28]:

pd.Series(count).head()

Out[28]:

0  AEROSPATIALE                 1955
   AIRBUS                      22740
   ALLIED SIGNAL INC            1096
   APPLIED ELASTOMERICS INC     1744
   BE AEROSPACE INC             1313
dtype: int64

In [29]:

links

Out[29]:

array([[  0,  35],
       [  0,  55],
       [  0,  62],
       ..., 
       [499, 459],
       [499, 462],
       [499, 480]], dtype=int64)