In [1]:
import chainer
In [2]:
chainer.__version__
Out[2]:
'1.5.1'

numpy vs. cupy

In [3]:
import numpy as np
import cupy as cp

dot

In [4]:
def dot_np(a, b):
    """Matrix product of two NumPy arrays via the ndarray.dot method."""
    product = a.dot(b)
    return product
In [5]:
def dot_cp(a, b):
    """Matrix product for CuPy arrays.

    The body is identical to dot_np: ndarray.dot is the shared API, so the
    same call works on either a NumPy or a CuPy array.
    """
    result = a.dot(b)
    return result
In [6]:
n = 1000
m = 10000
In [7]:
a_np = np.random.randn(n, m).astype('f')
b_np = np.random.randn(m, n).astype('f')
In [8]:
a_cp = cp.asarray(a_np)
b_cp = cp.asarray(b_np)
In [9]:
%timeit dot_np(a_np, b_np)
1 loops, best of 3: 811 ms per loop
In [10]:
%timeit dot_cp(a_cp, b_cp)
The slowest run took 2175.14 times longer than the fastest. This could mean that an intermediate result is being cached 
1 loops, best of 3: 110 µs per loop

norm

In [11]:
def norm_np(data):
    """Euclidean (L2) norm of each row of a 2-D NumPy array."""
    row_norms = np.linalg.norm(data, axis=1)
    return row_norms
In [12]:
cp.linalg.norm(a_cp, axis=1)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-12-8377ddd70259> in <module>()
----> 1 cp.linalg.norm(a_cp, axis=1)

TypeError: 'module' object is not callable
In [13]:
def norm_cp(data):
    """Row-wise Euclidean norm for CuPy arrays.

    Hand-rolled as sqrt(sum(x**2)) because cupy.linalg.norm is not callable
    in this CuPy version (see the TypeError demonstrated above).
    """
    squared_sums = cp.sum(data ** 2, axis=1)
    return cp.sqrt(squared_sums)
In [14]:
%timeit norm_np(a_np)
100 loops, best of 3: 11.8 ms per loop
In [15]:
%timeit norm_cp(a_cp)
The slowest run took 1863.49 times longer than the fastest. This could mean that an intermediate result is being cached 
1 loops, best of 3: 158 µs per loop

Compatible codes

In [16]:
from chainer import cuda

dot (shared array method — the same code runs on NumPy and CuPy arrays)

In [17]:
def dot(a, b):
    """Backend-agnostic matrix product.

    ndarray.dot is implemented by both NumPy and CuPy, so one function
    serves CPU and GPU arrays alike -- no dispatch needed.
    """
    return a.dot(b)

Total variation (uses chainer.Variable)

In [18]:
import chainer.functions as F

def tv(x_data, beta=2):
    """Total variation of a batch of images, as a chainer Variable.

    Computes (sum(dh^2) + sum(dw^2))**(beta/2), where dh/dw are vertical and
    horizontal finite differences obtained by convolving each channel plane
    with [1, -1] kernels.

    Args:
        x_data: array (NumPy or CuPy) of shape (n, ch, h, w).
        beta: exponent parameter; the summed squared differences are raised
            to beta/2 (default 2, i.e. plain sum of squared differences).

    Returns:
        A scalar chainer.Variable holding the total-variation value.
    """
    xp = cuda.get_array_module(x_data)
    n, ch, h, w = x_data.shape

    # 2x1 (vertical) and 1x2 (horizontal) difference kernels, shaped
    # (out_ch=1, in_ch=1, kh, kw) for convolution_2d.
    Wh_data = xp.array([[[[1], [-1]]]], dtype='f')
    Ww_data = xp.array([[[[1, -1]]]], dtype='f')

    x = chainer.Variable(x_data.astype('f'))
    Wh = chainer.Variable(Wh_data)
    Ww = chainer.Variable(Ww_data)

    # Treat every (sample, channel) plane as an independent single-channel
    # image so the 1-in-channel kernels apply per plane.  The original code
    # hard-coded 3 here, which was only correct for n=1, ch=3.
    x_planes = F.reshape(x, (n * ch, 1, h, w))
    diffh = F.convolution_2d(x_planes, W=Wh)
    diffw = F.convolution_2d(x_planes, W=Ww)

    # beta / 2. keeps float division under Python 2.
    tv = (F.sum(diffh**2) + F.sum(diffw**2))**(beta / 2.)
    return tv

im2patch (does not use chainer.Variable)

In [19]:
def get_patches_idx(image_size, patch_size, stride):
    """Start indices of patches of size `patch_size` along one image axis.

    Indices advance by `stride`; the final index is forced to
    image_size - patch_size so the last patch ends exactly at the border.

    Args:
        image_size: length of the axis in pixels.
        patch_size: patch extent along this axis (<= image_size).
        stride: step between consecutive patch starts.

    Returns:
        A list of int start indices, ending with image_size - patch_size.
    """
    last = image_size - patch_size
    # range(0, last, stride) replaces the Python-2-only
    # `range(last)[::stride]` (range objects don't support `+` on Python 3)
    # and avoids materializing the full [0, last) list.
    return list(range(0, last, stride)) + [last]

def im2patch(image, patch_size, stride):
    """Extract overlapping square patches from a CHW image.

    Works on NumPy or CuPy arrays; the output lives on the same device as
    the input (cuda.get_array_module picks the matching backend).

    Args:
        image: array of shape (ch, h, w).
        patch_size: side length of the square patches.
        stride: step between patch starts along both axes.

    Returns:
        Array of shape (n_patches, ch, patch_size, patch_size), with
        patches in row-major order (all column positions of row 0 first).
    """
    xp = cuda.get_array_module(image)
    ch, h, w = image.shape
    idx_h = get_patches_idx(h, patch_size, stride)
    idx_w = get_patches_idx(w, patch_size, stride)
    n_w = len(idx_w)

    patches = xp.zeros((len(idx_h) * n_w, ch, patch_size, patch_size),
                       dtype=image.dtype)
    for ih, hs in enumerate(idx_h):
        he = hs + patch_size
        for iw, ws in enumerate(idx_w):
            we = ws + patch_size
            # Row-major flat index.  The original strided by len(idx_h)
            # here, which collides/skips whenever len(idx_h) != len(idx_w);
            # the row stride must be the number of columns, len(idx_w).
            patches[ih * n_w + iw] = image[:, hs:he, ws:we]
    return patches
In [20]:
img_np = np.random.randn(3, 256, 256)
img_cp = cp.asarray(img_np)
patch_size = 8
stride = 4
In [21]:
%timeit im2patch(img_np, patch_size, stride)
10 loops, best of 3: 13.6 ms per loop
In [22]:
%timeit im2patch(img_cp, patch_size, stride)
1 loops, best of 3: 145 ms per loop