%matplotlib inline
import scipy, scipy.signal
from scipy import fftpack
import numpy as np
import torch
import matplotlib.pyplot as plt
import math
def specgram(sig, Fs, NFFT=256, figsize=(15,3), ax=None):
if ax is None: ax = plt.figure(figsize=figsize).gca()
_,_,_,im = ax.specgram(sig, scale='dB', Fs=Fs, NFFT=NFFT)
plt.colorbar(im, ax=ax)
Q = 48000
P = 44100
offset = 2000
# Exponential sweep: ~log-spaced instantaneous frequency from 100 Hz to 23.9 kHz
# over 96000 samples (2 s at Fs=Q); the offset softens the curve at the low end
instfreq = np.exp(np.linspace(np.log(offset+100), np.log(offset+23900), 96000))-offset
deltaphase = 2*np.pi*instfreq/Q   # per-sample phase increment for each instantaneous frequency
cphase = np.cumsum(deltaphase)    # integrate the increments to get the phase
sweep = np.sin(cphase)
specgram(sweep, P)
sweep_t = torch.tensor(sweep, dtype=torch.float32)
def resample_torch(x, num, dim=-1):
dim = (x.dim() + dim) if dim < 0 else dim
X = torch.rfft(x, 1, onesided=False)
Nx = X.shape[dim]
#print(f"x: {x.dtype}{x.shape}, X: {X.dtype}{X.shape}, Nx: {Nx}")
sl = [slice(None)] * X.ndimension()
newshape = list(X.shape)
newshape[dim] = num
#print(f"sl: {sl}, newshape: {newshape}")
N = int(np.minimum(num, Nx))
Y = torch.zeros(newshape, dtype=X.dtype, device=X.device)
#print(f"N: {N}, Y:{Y.dtype}{Y.shape}")
sl[dim] = slice(0, (N + 1) // 2)
Y[sl] = X[sl]
#print(f"Y[{sl}] = X[{sl}]")
sl[dim] = slice(-(N - 1) // 2, None)
Y[sl] = X[sl]
#print(f"Y[{sl}] = X[{sl}]")
if N % 2 == 0: # special treatment if low number of points is even. So far we have set Y[-N/2]=X[-N/2]
if N < Nx: # if downsampling
sl[dim] = slice(N//2, N//2+1,None) # select the component at frequency N/2
Y[sl] += X[sl] # add the component of X at N/2
#print(f'Downsample: Y[{sl} += X[{sl}]')
elif N < num: # if upsampling
sl[dim] = slice(num-N//2,num-N//2+1,None) # select the component at frequency -N/2
Y[sl] /= 2 # halve the component at -N/2
#s = f'Y[{sl}] // 2'
temp = Y[sl]
sl[dim] = slice(N//2,N//2+1,None) # select the component at +N/2
Y[sl] = temp # set that equal to the component at -N/2
#print(f'Upsample: Y[{sl}] = {s}')
y = torch.irfft(Y, 1, onesided=False) * (float(num) / float(Nx))
return y
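Note: torch.rfft and torch.irfft were removed in PyTorch 1.8. For reference, a sketch of the same Fourier-domain resampling using the newer torch.fft module might look like the following (an untested assumption here, not benchmarked):
def resample_fft_new(x, num, dim=-1):
    # Same bin-copying logic as resample_torch above, but via torch.fft
    dim = (x.dim() + dim) if dim < 0 else dim
    X = torch.fft.fft(x, dim=dim)                 # full complex spectrum
    Nx = X.shape[dim]
    N = min(num, Nx)
    newshape = list(X.shape)
    newshape[dim] = num
    Y = torch.zeros(newshape, dtype=X.dtype, device=X.device)
    sl = [slice(None)] * X.dim()
    sl[dim] = slice(0, (N + 1) // 2)              # copy positive frequencies
    Y[tuple(sl)] = X[tuple(sl)]
    sl[dim] = slice(-(N - 1) // 2, None)          # copy negative frequencies
    Y[tuple(sl)] = X[tuple(sl)]
    if N % 2 == 0:                                # same even-N fix-ups as above
        if N < Nx:                                # downsampling: fold in the +N/2 bin
            sl[dim] = slice(N//2, N//2 + 1)
            Y[tuple(sl)] += X[tuple(sl)]
        elif N < num:                             # upsampling: split the -N/2 bin
            sl[dim] = slice(num - N//2, num - N//2 + 1)
            Y[tuple(sl)] /= 2
            temp = Y[tuple(sl)]
            sl[dim] = slice(N//2, N//2 + 1)
            Y[tuple(sl)] = temp
    return torch.fft.ifft(Y, dim=dim).real * (float(num) / float(Nx))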
# TODO: Investigate difference here
sig_t = torch.tensor(sweep, dtype=torch.float32, device='cpu')[None,:]
for num in [11025, 22050, 44100, 54000, 19200]:
fft_t = resample_torch(sig_t, num)
np.testing.assert_allclose(fft_t[0,:].numpy(), scipy.signal.resample(sweep, num), err_msg=str(num))
specgram(resample_torch(sweep_t[None,:], P)[0,:] * 30, Fs=P)
specgram(scipy.signal.resample(sweep, P) * 30, Fs=P)
from scipy.signal._upfirdn import upfirdn as sp_upfirdn
from scipy.signal import resample_poly as sp_resample_poly, firwin as sp_firwin
import torch.nn.functional as F
The filter scipy uses is very large:
# From sp_resample_poly: Design a linear-phase low-pass FIR filter
max_rate = max(P, Q)
f_c = 1. / max_rate # cutoff of FIR filter (rel. to Nyquist)
half_len = 10 * max_rate # reasonable cutoff for our sinc-like function
orig_wfilt = sp_firwin(2 * half_len + 1, f_c, window=('kaiser', 5.0))
orig_wfilt.shape, orig_wfilt.dtype
((960001,), dtype('float64'))
So we'll use this modified FIR filter instead of the scipy default. Not only is it much smaller, but it also apparently performs better than the original.
# From http://signalsprocessed.blogspot.com/2017/05/resampling-in-python-electric-bugaloo.html
def resample_poly_filter(up, down, beta=5.0, L=16001, window=('kaiser', 5.0)):
# *** this block STOLEN FROM scipy.signal.resample_poly ***
# Determine our up and down factors
# Use a rational approximation to save computation time on really long
# signals
g_ = math.gcd(up, down)
up //= g_
down //= g_
max_rate = max(up, down)
sfact = np.sqrt(1+(beta/np.pi)**2)
# generate first filter attempt: with 6dB attenuation at f_c.
filt = sp_firwin(L, 1/max_rate, window=window)
N_FFT = 2**19
NBINS = N_FFT/2+1
paddedfilt = np.zeros(N_FFT)
paddedfilt[:L] = filt
ffilt = np.fft.rfft(paddedfilt)
# now find the minimum between f_c and f_c+sqrt(1+(beta/pi)^2)/L
bot = int(np.floor(NBINS/max_rate))
top = int(np.ceil(NBINS*(1/max_rate + 2*sfact/L)))
firstnull = (np.argmin(np.abs(ffilt[bot:top])) + bot)/NBINS
# generate the proper shifted filter
filt2 = sp_firwin(L, -firstnull+2/max_rate, window=window)
return filt2
wfilt = resample_poly_filter(P, Q, L=2**16+1)
wfilt.shape
(65537,)
wfilt_t = torch.tensor(wfilt, dtype=torch.float32)
wfilt_t.shape, wfilt_t.dtype
(torch.Size([65537]), torch.float32)
to_t = lambda a: torch.tensor(a, dtype=torch.float32)
The heart of scipy's resample_poly is the upfirdn method in scipy.signal._upfirdn, with the Cython implementation in scipy.signal._upfirdn_apply doing the actual work. The resample_poly method creates the FIR filter if you pass a window function (which we won't do, since we pass in our own filter), and does some trimming of the results of upfirdn; I gather this is needed to make the FIR work correctly across various input lengths and up/down ratios. Here we copy those setup bits and create a resample_poly that takes an upfirdn function we provide, so we can test various implementations. We also copy over the scipy.signal._upfirdn_apply._output_len method (converting it from Cython), which we'll need.
def ufd_out_len(n_h, n_in, up, down):
nt = n_in + (n_h + (-n_h % up)) // up - 1
nt *= up
need = nt // down
if nt % down > 0: need += 1
return need
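A quick sanity check of this formula on the toy sizes used further below (a 16-tap filter and 8 input samples):
ufd_out_len(16, 8, 4, 1), ufd_out_len(16, 8, 3, 2)  # -> (44, 20), matching the upfirdn output shapes we'll see below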
# Preprocessing from scipy's resample_poly
def resample_poly_pre(x, up, down, filt):
g_ = math.gcd(up, down)
up //= g_
down //= g_
if up == down == 1:
return x
n_x = x.shape[-1]
n_out = n_x * up
n_out = n_out // down + bool(n_out % down)
if filt.ndimension() != 1: raise ValueError("Filter should be a 1d tensor")
n_filt = len(filt)
half_len = (n_filt-1) // 2
h = filt * up
# Zero-pad filter to put output samples at center
n_pre_pad = (down - half_len % down)
n_post_pad = 0
n_pre_remove = (half_len + n_pre_pad) // down
# We should rarely need to do this given our filter lengths...
while ufd_out_len(n_filt + n_pre_pad + n_post_pad, n_x, up, down) < n_out + n_pre_remove:
n_post_pad += 1
h = F.pad(h, (n_pre_pad, n_post_pad))
n_pre_remove_end = n_pre_remove + n_out
remove = slice(n_pre_remove,n_pre_remove_end)
return h,x,up,down,remove
# And a resample_poly that takes an upfirdn to use.
def resample_poly(x, up, down, filt, upfirdn_func):
h,x,up,down,remove = resample_poly_pre(x, up, down, filt)
y = upfirdn_func(h, x, up, down)
return y[...,remove]
L = 4
tst1 = np.arange(1,9)
cf1 = np.arange(4,8)
tst1_t = torch.tensor(tst1, dtype=torch.float32)
cf1_t = torch.tensor(cf1, dtype=torch.float32)
Our first implementation of upfirdn will implement interpolation (the upfir bit) through zero-stuffing (inserting up-1 zeros between each sample) and then filtering (i.e. convolving). If we then downsample (select every down-th sample of) the output of this, we have the full upfirdn function from scipy.
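For example (toy values), zero-stuffing [1, 2, 3] with up=3 gives [1, 0, 0, 2, 0, 0, 3, 0, 0]; the view trick used below does this without a Python loop:
x_demo = torch.tensor([1., 2., 3.])
x_zp_demo = torch.zeros(len(x_demo) * 3)
x_zp_demo.view(-1, 3)[:, 0] = x_demo  # write each sample into the first slot of each group of 3
x_zp_demo  # tensor([1., 0., 0., 2., 0., 0., 3., 0., 0.])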
def upfir_zs(h, x, up): # Zero-padding version
# Zero stuff x
n_h = h.shape[0]
n_x = x.shape[-1]
x_zp = torch.zeros(n_x * up, dtype=x.dtype)
x_zp.view(-1, up)[:,0] = x
# FIR via Conv1d
x_zp_c = x_zp[None,None,...] # Add batch and channel dimensions
    h_c = h.flip(0)[None,None,...] # Reverse coefficients and add out/in channel dimensions
pad = n_h-1
y = F.conv1d(x_zp_c, h_c, padding=pad)
n_out = ufd_out_len(n_h, n_x, up, 1)
return y[0,0,:n_out]
for n_cf in range(4,9):
cf = np.arange(n_cf,n_cf*2 + 1)
for n_x in range(1, 28, 3):
x = np.arange(1, n_x+1)
for up in range(2,10):
y = sp_upfirdn(cf, x, up, 1)
y_t = upfir_zs(to_t(cf), to_t(x), up)
np.testing.assert_allclose(y, y_t.numpy(), err_msg=f'up={up}')
print('All good!')
All good!
def upfirdn_zs(h, x, up, down): # Zero-padding version
y = upfir_zs(h, x, up)
return y[::down]
for n_cf in range(4,9):
cf = np.arange(n_cf,n_cf*2 + 1)
for n_x in range(1, 28, 3):
x = np.arange(1, n_x+1)
for up in range(2,10):
for down in range(1,10):
y = sp_upfirdn(cf, x, up, down)
y_t = upfirdn_zs(to_t(cf), to_t(x), up, down)
np.testing.assert_allclose(y, y_t.numpy(), err_msg=f'up={up}, down={down}, n_cf={n_cf}, n_x={n_x}')
print('All good!')
All good!
sig_t = torch.tensor(sweep, dtype=torch.float32)
sig_t.shape, sig_t.dtype
(torch.Size([96000]), torch.float32)
h_t,x_t,up,down,_ = resample_poly_pre(sig_t, P, Q, wfilt_t)
h_t.shape, x_t.shape, up, down
(torch.Size([65569]), torch.Size([96000]), 147, 160)
n_h = h_t.shape[0]
n_x = x_t.shape[-1]
n_h, n_x, n_x * up
#x_zp = torch.zeros(n_x * up, dtype=x.dtype)
#x_zp.view(-1, up)[:,0] = x
(65569, 96000, 14112000)
This crashes PyTorch; it doesn't seem to like convolving a 14M-sample input with a 65K-tap kernel.
#upfirdn_zs(h_t, x_t, up, down)
Here we perform polyphase interpolation. Instead of zero-stuffing we use the trick outlined in http://dspguru.com/dsp/faqs/multirate/interpolation/: we divide our n_h-tap filter into up separate n_h/up-tap filters, each one a different phase. We apply each one to the entire input and then interleave their outputs. This gives output equivalent to applying the original filter to the zero-stuffed input, but with fewer computations and less memory.
Broken down:
up,n_h,n_x = 4,16,8
h_a = np.arange(1 ,n_h+1)*10
x_a = np.arange(n_x)
h_a, x_a
(array([ 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160]), array([0, 1, 2, 3, 4, 5, 6, 7]))
y_a = sp_upfirdn(h_a, x_a, up, 1)
y_a.shape, y_a
((44,), array([ 0., 0., 0., 0., 10., 20., 30., 40., 70., 100., 130., 160., 220., 280., 340., 400., 500., 600., 700., 800., 780., 920., 1060., 1200., 1060., 1240., 1420., 1600., 1340., 1560., 1780., 2000., 1540., 1720., 1900., 2080., 1410., 1540., 1670., 1800., 910., 980., 1050., 1120.]))
h,x = [torch.tensor(a, dtype=torch.float32) for a in [h_a,x_a]]
h,x
(tensor([ 10., 20., 30., 40., 50., 60., 70., 80., 90., 100., 110., 120., 130., 140., 150., 160.]), tensor([0., 1., 2., 3., 4., 5., 6., 7.]))
n_h,n_x = len(h), len(x)
h_pad = F.pad(h, (0, -n_h % up)) # Pad to multiple of up
n_h, n_x, h_pad.shape
(16, 8, torch.Size([16]))
h_pad
tensor([ 10., 20., 30., 40., 50., 60., 70., 80., 90., 100., 110., 120., 130., 140., 150., 160.])
n_h_pp = len(h_pad)//up
h_pp = h_pad.view(1, 1, n_h_pp, -1).transpose(0,1).flip(2)
h_pp.shape, n_h_pp
(torch.Size([1, 1, 4, 4]), 4)
[h_pp[...,i] for i in range(up)]
[tensor([[[130., 90., 50., 10.]]]), tensor([[[140., 100., 60., 20.]]]), tensor([[[150., 110., 70., 30.]]]), tensor([[[160., 120., 80., 40.]]])]
#n_y_pp = ufd_out_len(n_h, n_x, up, down) # Length of a single phases output
n_y_pp = n_x + n_h_pp-1
n_x, n_y_pp
(8, 11)
y_pp = torch.zeros(n_y_pp * up).view(-1,up)
y_pp.shape, y_pp[:,0].shape
(torch.Size([11, 4]), torch.Size([11]))
for i in range(up):
y_pp[:,i] = F.conv1d(x[None,None,:], h_pp[...,i], padding=n_h_pp-1)
y_pp.shape, y_pp
(torch.Size([11, 4]), tensor([[ 0., 0., 0., 0.], [ 10., 20., 30., 40.], [ 70., 100., 130., 160.], [ 220., 280., 340., 400.], [ 500., 600., 700., 800.], [ 780., 920., 1060., 1200.], [1060., 1240., 1420., 1600.], [1340., 1560., 1780., 2000.], [1540., 1720., 1900., 2080.], [1410., 1540., 1670., 1800.], [ 910., 980., 1050., 1120.]]))
y = y_pp.view(-1)
y
tensor([ 0., 0., 0., 0., 10., 20., 30., 40., 70., 100., 130., 160., 220., 280., 340., 400., 500., 600., 700., 800., 780., 920., 1060., 1200., 1060., 1240., 1420., 1600., 1340., 1560., 1780., 2000., 1540., 1720., 1900., 2080., 1410., 1540., 1670., 1800., 910., 980., 1050., 1120.])
np.allclose(y.numpy(), y_a)
True
That code, all together:
def upfir_conv(h, x, up):
n_h,n_x = len(h),len(x)
h_pad = F.pad(h, (0, -n_h % up)) # Pad to multiple of up
n_h_pp = len(h_pad) // up # number of taps per phase
h_pp = h_pad.view(1, 1, n_h_pp, -1).transpose(0,1).flip(2) # Filters per phase
n_y_pp = n_x + n_h_pp-1 # Outputs per phase
y_pp = torch.zeros(n_y_pp * up).view(-1,up)
for i in range(up): # Apply phases
y_pp[:,i] = F.conv1d(x[None,None,:], h_pp[...,i], padding=n_h_pp-1)
y = y_pp.view(-1) # Interleave outputs
return y
y = upfir_conv(to_t(h_a), to_t(x_a), up)
y.shape, y
(torch.Size([44]), tensor([ 0., 0., 0., 0., 10., 20., 30., 40., 70., 100., 130., 160., 220., 280., 340., 400., 500., 600., 700., 800., 780., 920., 1060., 1200., 1060., 1240., 1420., 1600., 1340., 1560., 1780., 2000., 1540., 1720., 1900., 2080., 1410., 1540., 1670., 1800., 910., 980., 1050., 1120.]))
np.allclose(y.numpy(), y_a)
True
for n_h in range(3, 9):
h = np.arange(1 ,n_h+1)*10
for n_x in range(2, 52, 7):
x = np.arange(n_x)
for up in range(2,10):
s = f"up={up}, n_h={n_h}, n_x={n_x}"
#for down in range(1,10):
y = sp_upfirdn(h, x, up, 1)
try:
y_t = upfir_conv(to_t(h), to_t(x), up) #, down)
except Exception as exc:
print(f"{s}: Exception {exc}")
continue
np.testing.assert_allclose(y, y_t.numpy(), err_msg=s)
print('All good!')
All good!
def upfirdn_conv(h, x, up, down):
y = upfir_conv(h, x, up)
return y[...,::down]
up, down = 3,2
y_a = sp_upfirdn(h_a, x_a, up, down)
y_a.shape, y_a
((20,), array([ 0., 0., 20., 60., 120., 240., 400., 600., 900., 1260., 1500., 1700., 2200., 2160., 1920., 2280., 1740., 980., 1120., 0.]))
y = upfirdn_conv(to_t(h_a), to_t(x_a), up, down)
y.shape, y
(torch.Size([20]), tensor([ 0., 0., 20., 60., 120., 240., 400., 600., 900., 1260., 1500., 1700., 2200., 2160., 1920., 2280., 1740., 980., 1120., 0.]))
np.allclose(y.numpy(), y_a)
True
for n_h in range(3, 9):
h = np.arange(1 ,n_h+1)*10
for n_x in range(2, 52, 7):
x = np.arange(n_x)
for up in range(2,10):
s = f"up={up}, down={down}, n_h={n_h}, n_x={n_x}"
for down in range(1,10):
y = sp_upfirdn(h, x, up, down)
try:
y_t = upfirdn_conv(to_t(h), to_t(x), up, down)
except Exception as exc:
print(f"{s}: Exception {exc}")
continue
np.testing.assert_allclose(y, y_t.numpy(), err_msg=s)
print('All good!')
All good!
y = resample_poly(sweep_t, P, Q, wfilt_t, upfirdn_conv)
print(y.shape)
specgram(y.numpy()*30, Fs=P)
torch.Size([88200])
y_a = sp_resample_poly(sweep, P, Q, window=wfilt)
print(y_a.shape)
specgram(y_a*30, Fs=P)
(88200,)
# TODO: Check this is just calculation error, not bug
np.testing.assert_allclose(y.numpy(), y_a)
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-60-2904afc1b2a2> in <module>
      1 # TODO: Check this is just calculation error, not bug
----> 2 np.testing.assert_allclose(y.numpy(), y_a)

~/.conda/envs/fastai_audio-dev/lib/python3.7/site-packages/numpy/testing/_private/utils.py in assert_allclose(actual, desired, rtol, atol, equal_nan, err_msg, verbose)
   1450     header = 'Not equal to tolerance rtol=%g, atol=%g' % (rtol, atol)
   1451     assert_array_compare(compare, actual, desired, err_msg=str(err_msg),
-> 1452                          verbose=verbose, header=header, equal_nan=equal_nan)
   1453
   1454

~/.conda/envs/fastai_audio-dev/lib/python3.7/site-packages/numpy/testing/_private/utils.py in assert_array_compare(comparison, x, y, err_msg, verbose, header, precision, equal_nan, equal_inf)
    787                                 verbose=verbose, header=header,
    788                                 names=('x', 'y'), precision=precision)
--> 789             raise AssertionError(msg)
    790         except ValueError:
    791             import traceback

AssertionError:
Not equal to tolerance rtol=1e-07, atol=0

(mismatch 51.30612244897959%)
 x: array([ 0.012801,  0.027505,  0.041481, ..., -0.193291,  0.232561, -0.293211], dtype=float32)
 y: array([ 0.012801,  0.027505,  0.041481, ..., -0.193291,  0.232561, -0.293211])
diffs = np.abs(y.numpy() - y_a)
diffs.max(), diffs.mean()
(1.1749612726097425e-06, 8.254097168143291e-08)
rel_diffs = diffs / y_a
rel_diffs.max(), rel_diffs.mean()
(1.0, 1.759182941034847e-05)
mm = (~np.isclose(y.numpy(), y_a)).sum()
print(f"{mm} mismatching samples, {mm/len(y_a):.1%}")
2367 mismatching samples, 2.7%
So those results look reasonable; there might be a slight numerical error given it's not allclose, but nothing obviously wrong in the spectrogram.
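(For reference, a float32-scale tolerance does pass, consistent with the max absolute difference of ~1.2e-6 above:)
np.testing.assert_allclose(y.numpy(), y_a, atol=1e-5)  # passes: atol comfortably above the observed max diff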
As can be seen in the performance section below, this performs terribly. I think the issue is that we convolve our ~64K-tap filter with all of the input samples and then discard all but 1/down of the results when we downsample; that's 159/160 of the work thrown away in the 48000->44100 example we used. So, instead of using torch's conv1d, let's do the filtering ourselves.
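As a rough back-of-envelope check on that (using the padded ~65K-tap filter split into 147 phases; sizes from the shapes above):
n_x_, n_h_, up_, down_ = 96000, 65569, 147, 160
n_hpp_ = -(-n_h_ // up_)                                  # taps per phase (447)
macs_conv = up_ * n_x_ * n_hpp_                           # the conv version computes every interleaved output
macs_dot = ufd_out_len(n_h_, n_x_, up_, down_) * n_hpp_   # we only actually need the kept outputs
macs_conv / macs_dot                                      # ~160x more multiply-adds than necessary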
def upfirdn_dot(h, x, up, down):
n_h,n_x = len(h), len(x)
# Split filter into up phases, padding so even lengths, coefficients in each phase should be reversed
hpad = F.pad(h, (0, -n_h%up))
n_hpp = len(hpad)//up
hpp = hpad.view(n_hpp, -1).t().flip(1)
# Create padded input
xpad = F.pad(x, (n_hpp-1,n_hpp-1))
# Create output array and fill it
n_y = ufd_out_len(n_h, n_x, up, down)
y = torch.zeros(n_y, dtype=x.dtype, device=x.device)
for i_y in range(n_y):
i_x = i_y * down // up # Input idx at which to filter
i_hpp = (i_y * down) % up # Filter phase to use - i_y*down gives non-downsampled index
#print(f"y[{i_y:>2}] = h_pp[{i_hpp}] . xpad[{i_x:>2}:{i_x+n_hpp:>2}]")
# Apply FIR to sample by calculating dot product
y[i_y] = hpp[i_hpp,:].dot(xpad[i_x:i_x+n_hpp])
return y
y = resample_poly(sweep_t, P, Q, wfilt_t, upfirdn_dot)
print(y.shape)
specgram(y.numpy()*30, Fs=P)
torch.Size([88200])
torch.cuda.get_device_name('cuda:0')
'GeForce RTX 2070'
!cat /proc/cpuinfo | grep 'model name' | uniq
model name : AMD Ryzen 5 2600 Six-Core Processor
I tested performance of the above resample_torch implementation, run on CPU, on the entire urbansound8k dataset. This is 6.7GB containing 8732 wav files, mostly 4 seconds each, though some are shorter. It's stored on a 250GB Samsung 970 Evo Plus NVMe. Various sample rates:
44100.0 5370
48000.0 2502
96000.0 610
24000.0 82
16000.0 45
22050.0 44
11025.0 39
192000.0 17
8000.0 12
11024.0 7
32000.0 4
Here are the results. pt_resample is the implementation above, ta_load is torchaudio.load, and all resample results include loading the file and resampling everything to 22050Hz. I ran sync && echo 3 > /proc/sys/vm/drop_caches before each run to clear caches, which seemed to work.
Processing 8732 files with 5743M samples:
ta_load: 13.9s total; mean: 1.59ms; std: 1.364ms; max: 27.7ms
sp_resample: 1694.2s total; mean: 194.03ms; std: 2015.948ms; max: 91911.7ms
sp_resample_poly: 75.4s total; mean: 8.64ms; std: 4.568ms; max: 60.4ms
sp_decimate2: 75.4s total; mean: 8.63ms; std: 4.440ms; max: 46.1ms
pt_resample: 43.7s total; mean: 5.01ms; std: 2.703ms; max: 46.2ms
Note that while scipy is using one core (~90% utilisation in top), PyTorch uses all cores (~580% utilisation). So this is a fair bit slower than SciPy's resample_poly when adjusting for this, but obviously a whole lot faster than SciPy's resample.
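Roughly what each pt_resample timing covers per file, as a sketch (fname here is just a placeholder path, and torchaudio isn't otherwise imported in this notebook):
import torchaudio
sig, sr = torchaudio.load(fname)   # fname: placeholder path to one wav file
if sr != 22050:
    sig = resample_torch(sig, int(round(sig.shape[-1] * 22050 / sr)))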
n_rep = 1000
for from_sr,to_sr in [(48000,44100), (48000,22050),(44100,22050),(22050,16000)]:
times = [None]*n_rep
n_from = 4*from_sr # 4 seconds
n_to = 4*to_sr
for i in range(n_rep):
sig = torch.rand(1, n_from, dtype=torch.float32, device='cpu')
start,done = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
start.record()
sig_g = sig.to(device='cuda:0')
y_g = resample_torch(sig_g, n_to)
done.record()
done.synchronize()
times[i] = start.elapsed_time(done)
del sig
times = np.array(times)
print(f"{from_sr}->{to_sr}: {times.mean():>2.3f}ms +/- {times.std():>1.4f}ms; {times.mean()/from_sr*1000*1000:>2.3f}us/KSamp")
48000->44100: 0.342ms +/- 0.0067ms; 7.128us/KSamp
48000->22050: 0.342ms +/- 0.0036ms; 7.125us/KSamp
44100->22050: 0.340ms +/- 0.0281ms; 7.704us/KSamp
22050->16000: 0.292ms +/- 0.0026ms; 13.239us/KSamp
And allowing operations across items to overlap:
n_rep = 1000
for from_sr,to_sr in [(48000,44100), (48000,22050),(44100,22050),(22050,16000)]:
n_from = 4*from_sr # 4 seconds
n_to = 4*to_sr
start,done = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
start.record()
for i in range(n_rep):
sig_g = torch.rand(n_from, dtype=torch.float32, device='cuda:0')
y_g = resample_torch(sig_g, n_to)
done.record()
done.synchronize()
t = start.elapsed_time(done)
print(f"{from_sr}->{to_sr}: {t/1000:.3f}s; {t/n_rep*1000:2.3f}us/item; {t/n_rep*1000/from_sr*1000:2.3f}us/KSamp")
48000->44100: 0.225s; 225.322us/item; 4.694us/KSamp
48000->22050: 0.225s; 224.782us/item; 4.683us/KSamp
44100->22050: 0.228s; 228.145us/item; 5.173us/KSamp
22050->16000: 0.222s; 222.457us/item; 10.089us/KSamp
So pretty fast.
Just synthetic tests here, so all the usual caveats apply.
%timeit resample_poly(sweep_t, P, Q, wfilt_t, upfirdn_conv)
1.06 s ± 5.42 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
sweep_t.shape
torch.Size([96000])
%timeit sp_resample_poly(sweep_t, P, Q, window=wfilt)
32 ms ± 47.4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
So not great on CPU, and that's with PyTorch using multiple threads versus scipy which only uses one. What about GPU?
Q,P=48000,16000
wf_t = torch.tensor(resample_poly_filter(P, Q, L=49153), dtype=torch.float32)
evts = [torch.cuda.Event(enable_timing=True) for _ in range(4)]
start,cp_filt,cp_sig,done = evts
start.record()
wfilt_g = wf_t.to(device='cuda:0')
cp_filt.wait()
cp_filt.record()
sig_g = sweep_t.to(device='cuda:0')
cp_sig.wait()
cp_sig.record()
y_g = resample_poly(sig_g, P, Q, wfilt_g, upfirdn_conv)
done.record()
done.synchronize()
cp_filt_t,cp_sig_t,rs_t = [evts[i].elapsed_time(evts[i+1]) for i in range(len(evts)-1)]
print(f"Copy filer: {cp_filt_t:4.3f}ms; Copy signal: {cp_sig_t:4.3f}ms; Resample: {rs_t:4.3f}ms")
del sig_g, wfilt_g
Copy filter: 0.875ms; Copy signal: 0.360ms; Resample: 448.299ms
n_rep = 50
times = [None]*n_rep
wfilt_g = wf_t.to(device='cuda:0')
for i in range(n_rep):
sig = torch.rand(4*Q, dtype=torch.float32, device='cpu')
start,done = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
start.record()
sig_g = sig.to(device='cuda:0')
y_g = resample_poly(sig_g, P, Q, wfilt_g, upfirdn_conv)
done.record()
done.synchronize()
times[i] = start.elapsed_time(done)
times = np.array(times)
print(f"{times.mean():3.1f}ms +/- {times.std():.3f}ms")
179.8ms +/- 9.918ms
n_rep = 5
wfilt_g = wf_t.to(device='cuda:0')
start,done = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
start.record()
for i in range(n_rep):
sig_g = torch.rand(96000, dtype=torch.float32, device='cuda:0')
y_g = resample_poly(sig_g, P, Q, wfilt_g, upfirdn_conv)
done.record()
done.synchronize()
t = start.elapsed_time(done)
print(f"{t/1000:.3f}s; {t/n_rep:.3f}ms/item")
0.532s; 106.418ms/item
So, it doesn't perform very well. We might get faster than this with some explicit parallelism through torch.cuda.streams (or, since I think multiple threads get around some stream synchronisation issues, doing this in a databunch with num_workers>1 might help).
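A minimal sketch of the explicit-streams idea (untested; sigs and n_streams are made up here, and the non_blocking copies only overlap if the CPU tensors are pinned):
n_streams = 4
streams = [torch.cuda.Stream() for _ in range(n_streams)]
outs = []
for i, sig in enumerate(sigs):                      # sigs: hypothetical list of 1-D CPU tensors
    with torch.cuda.stream(streams[i % n_streams]):
        sig_g = sig.pin_memory().to('cuda:0', non_blocking=True)
        outs.append(resample_poly(sig_g, P, Q, wfilt_g, upfirdn_conv))
torch.cuda.synchronize()                            # wait for all streams before using outs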
I think this might be inherent to this way of implementing it. We convolve our 96000 input samples with a ~65000-tap kernel, which is a lot of FLOPs, and then throw out 159/160 of the outputs (decimating by down), so most of that calculation is wasted. I think scipy's resample_poly doesn't calculate those unneeded samples (though I'm not quite sure), but we can't avoid them with our torch.nn.conv1d implementation. Note we can't use stride as we possibly could in a non-polyphase implementation, because we need to stride over the interleaved results, not each phase's output, and since we reduce up and down by their GCD there is no common factor of up and down left to stride by.
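For the 48000->44100 case, for instance:
math.gcd(147, 160)  # 1 -- no shared factor, so no single conv1d stride picks out every down-th interleaved sample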
Trying a smaller filter:
wf_t = torch.tensor(resample_poly_filter(P, Q, L=8001), dtype=torch.float32)
n_rep = 5
wfilt_g = wf_t.to(device='cuda:0')
start,done = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
start.record()
for i in range(n_rep):
sig_g = torch.rand(96000, dtype=torch.float32, device='cuda:0')
y_g = resample_poly(sig_g, P, Q, wfilt_g, upfirdn_conv)
done.record()
done.synchronize()
t = start.elapsed_time(done)
print(f"{t/1000:.3f}s; {t/n_rep:.3f}ms/item")
0.260s; 52.055ms/item
A fair bit better, but still slow.
Removing the unneeded calculations:
%timeit resample_poly(sweep_t, P, Q, wfilt_t, upfirdn_dot)
1.27 s ± 10.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
sweep_t.shape
torch.Size([96000])
%timeit sp_resample_poly(sweep, P, Q, window=wfilt)
3.51 s ± 180 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
wf_t = torch.tensor(resample_poly_filter(P, Q, L=1001), dtype=torch.float32)
n_rep = 5
times = [None]*n_rep
wfilt_g = wf_t.to(device='cuda:0')
sig_g = torch.rand(96000, dtype=torch.float32, device='cuda:0')
for i in range(n_rep):
start,done = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
start.record()
y_g = resample_poly(sig_g, P, Q, wfilt_g, upfirdn_dot)
done.record()
done.synchronize()
times[i] = start.elapsed_time(done)
times = np.array(times)
print(f"{times.mean():3.1f}ms +/- {times.std():.3f}ms")
5272.7ms +/- 78.641ms
Ouch, a whole lot worse. I think the sheer number of tiny kernel launches destroys performance.
torch.cuda.get_device_properties('cuda:0')
_CudaDeviceProperties(name='GeForce RTX 2070', major=7, minor=5, total_memory=7979MB, multi_processor_count=36)
torch.arange(3).dot(torch.arange(1,4))
tensor(8)
Q,P=48000,44100
up = P/math.gcd(P,Q)
down = Q/math.gcd(P,Q)
N = 64*1024 + 1
wfilt = resample_poly_filter(P, Q, L=2**16+1)
wfilt_t = torch.tensor(wfilt, dtype=torch.float32)
up,down,wfilt.shape
(147.0, 160.0, (65537,))
specgram(sp_resample_poly(sweep, Q, P)*30, Fs=P)
specgram(sp_resample_poly(sweep, Q, P, window=wfilt)*30, Fs=P)
nts = [2001,4001,8001,16001,32001,49153,65537]
fig,axs = plt.subplots(len(nts), 1, figsize=(15,4*len(nts)))
for nt,ax in zip(nts,axs):
wf = resample_poly_filter(P, Q, L=nt)
y = sp_resample_poly(sweep, P, Q, window=wf)
specgram(y*30, Fs=P, ax=ax)
ax.set_title(f'N={nt}')
def freqresp(filt, Q, P, frng=(0,1.25), N_freq=1024):
    '''Get the frequency response of a resampling FIR across the `frng` frequency range (expressed as a fraction of the output Nyquist, P/2)'''
    # The filter operates on the upsampled signal.
    # Frequencies are relative to this, such that pi is the Nyquist limit.
    up = P // math.gcd(P, Q)
    down = Q // math.gcd(P, Q)
    nyq_us = Q * up / 2
    freqs = np.linspace(frng[0]*np.pi/down, frng[1]*np.pi/down, N_freq, endpoint=False) # pi/down will be the cutoff freq when we downsample
    w,h = scipy.signal.freqz(filt, worN=freqs)
    return (w/np.pi)*nyq_us, np.absolute(h)
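To make that normalisation concrete for the 48000->44100 case (P=44100, Q=48000):
g = math.gcd(P, Q)
up_, down_ = P // g, Q // g   # 147, 160
Q * up_ / 2                   # Nyquist of the upsampled signal: 3,528,000 Hz
Q * up_ / 2 / down_           # pi/down therefore corresponds to 22050 Hz, the output Nyquist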
w,h = freqresp(wfilt, Q, P)
plt.plot(w, h, linewidth=2);
# Deviation from ideal based on https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.kaiserord.html#scipy.signal.kaiserord
w,h = freqresp(wfilt, Q, P, frng=(0.5,1.5))
ideal = w < P/2 # Ideal filter
deviation = np.abs(h - ideal)
plt.figure(figsize=(10,4))
plt.plot(w, 20*np.log10(np.abs(deviation)))
plt.axhline(-65, color='r', ls='--', alpha=0.3)
plt.ylim(-90, -45);
wf = resample_poly_filter(P, Q, L=2**16+1, window=('blackman'))
w,h = freqresp(wf, Q, P, frng=(0.5,1.5))
ideal = w < P/2 # Ideal filter
deviation = np.abs(h - ideal)
plt.figure(figsize=(10,4))
plt.plot(w, 20*np.log10(np.abs(deviation)))
plt.axhline(-65, color='r', ls='--', alpha=0.3)
plt.ylim(-90, -45);
nts = [2001,4001,8001,16001,32001,49153,65537]
fig,axs = plt.subplots(len(nts), 1, figsize=(9,4*len(nts)))
for nt,ax in zip(nts,axs):
wf = resample_poly_filter(P, Q, L=nt)
w,h = freqresp(wf, Q, P, frng=(0.5,1.5))
ideal = w < P/2 # Ideal filter
deviation = np.abs(h - ideal)
ax.plot(w, 20*np.log10(np.abs(deviation)))
ax.axhline(-65, color='r', ls='--', alpha=0.3)
ax.set_ylim(-90, -45);
ax.set_title(f'N={nt}')