%matplotlib inline
import scipy, scipy.signal
from scipy import fftpack
import numpy as np
import torch
import matplotlib.pyplot as plt
import math
def specgram(sig, Fs, NFFT=256, figsize=(15,3), ax=None):
if ax is None: ax = plt.figure(figsize=figsize).gca()
_,_,_,im = ax.specgram(sig, scale='dB', Fs=Fs, NFFT=NFFT)
plt.colorbar(im, ax=ax)
Q = 48000
P = 44100
offset = 2000
# Exponential sweep: ~log-spaced instantaneous frequency from 100 Hz to 23.9 kHz
# over 96000 samples (2 s at Fs=Q); the offset softens the curve at the low end
instfreq = np.exp(np.linspace(np.log(offset+100), np.log(offset+23900), 96000))-offset
deltaphase = 2*np.pi*instfreq/Q   # per-sample phase increment for each instantaneous frequency
cphase = np.cumsum(deltaphase)    # integrate the increments to get the phase
sweep = np.sin(cphase)
specgram(sweep, P)
sweep_t = torch.tensor(sweep, dtype=torch.float32)
def resample_torch(x, num, dim=-1):
dim = (x.dim() + dim) if dim < 0 else dim
X = torch.rfft(x, 1, onesided=False)
Nx = X.shape[dim]
#print(f"x: {x.dtype}{x.shape}, X: {X.dtype}{X.shape}, Nx: {Nx}")
sl = [slice(None)] * X.ndimension()
newshape = list(X.shape)
newshape[dim] = num
#print(f"sl: {sl}, newshape: {newshape}")
N = int(np.minimum(num, Nx))
Y = torch.zeros(newshape, dtype=X.dtype, device=X.device)
#print(f"N: {N}, Y:{Y.dtype}{Y.shape}")
sl[dim] = slice(0, (N + 1) // 2)
Y[sl] = X[sl]
#print(f"Y[{sl}] = X[{sl}]")
sl[dim] = slice(-(N - 1) // 2, None)
Y[sl] = X[sl]
#print(f"Y[{sl}] = X[{sl}]")
if N % 2 == 0: # special treatment if low number of points is even. So far we have set Y[-N/2]=X[-N/2]
if N < Nx: # if downsampling
sl[dim] = slice(N//2, N//2+1,None) # select the component at frequency N/2
Y[sl] += X[sl] # add the component of X at N/2
#print(f'Downsample: Y[{sl} += X[{sl}]')
elif N < num: # if upsampling
sl[dim] = slice(num-N//2,num-N//2+1,None) # select the component at frequency -N/2
Y[sl] /= 2 # halve the component at -N/2
#s = f'Y[{sl}] // 2'
temp = Y[sl]
sl[dim] = slice(N//2,N//2+1,None) # select the component at +N/2
Y[sl] = temp # set that equal to the component at -N/2
#print(f'Upsample: Y[{sl}] = {s}')
y = torch.irfft(Y, 1, onesided=False) * (float(num) / float(Nx))
return y
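Note: torch.rfft and torch.irfft were removed in PyTorch 1.8. For reference, a sketch of the same Fourier-domain resampling using the newer torch.fft module might look like the following (an untested assumption here, not benchmarked):
def resample_fft_new(x, num, dim=-1):
    # Same bin-copying logic as resample_torch above, but via torch.fft
    dim = (x.dim() + dim) if dim < 0 else dim
    X = torch.fft.fft(x, dim=dim)                 # full complex spectrum
    Nx = X.shape[dim]
    N = min(num, Nx)
    newshape = list(X.shape)
    newshape[dim] = num
    Y = torch.zeros(newshape, dtype=X.dtype, device=X.device)
    sl = [slice(None)] * X.dim()
    sl[dim] = slice(0, (N + 1) // 2)              # copy positive frequencies
    Y[tuple(sl)] = X[tuple(sl)]
    sl[dim] = slice(-(N - 1) // 2, None)          # copy negative frequencies
    Y[tuple(sl)] = X[tuple(sl)]
    if N % 2 == 0:                                # same even-N fix-ups as above
        if N < Nx:                                # downsampling: fold in the +N/2 bin
            sl[dim] = slice(N//2, N//2 + 1)
            Y[tuple(sl)] += X[tuple(sl)]
        elif N < num:                             # upsampling: split the -N/2 bin
            sl[dim] = slice(num - N//2, num - N//2 + 1)
            Y[tuple(sl)] /= 2
            temp = Y[tuple(sl)]
            sl[dim] = slice(N//2, N//2 + 1)
            Y[tuple(sl)] = temp
    return torch.fft.ifft(Y, dim=dim).real * (float(num) / float(Nx))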
# TODO: Investigate difference here
sig_t = torch.tensor(sweep, dtype=torch.float32, device='cpu')[None,:]
for num in [11025, 22050, 44100, 54000, 19200]:
fft_t = resample_torch(sig_t, num)
np.testing.assert_allclose(fft_t[0,:].numpy(), scipy.signal.resample(sweep, num), err_msg=str(num))
specgram(resample_torch(sweep_t[None,:], P)[0,:] * 30, Fs=P)
specgram(scipy.signal.resample(sweep, P) * 30, Fs=P)
from scipy.signal._upfirdn import upfirdn as sp_upfirdn
from scipy.signal import resample_poly as sp_resample_poly, firwin as sp_firwin
import torch.nn.functional as F
The filter scipy uses is very large:
# From sp_resample_poly: Design a linear-phase low-pass FIR filter
max_rate = max(P, Q)
f_c = 1. / max_rate # cutoff of FIR filter (rel. to Nyquist)
half_len = 10 * max_rate # reasonable cutoff for our sinc-like function
orig_wfilt = sp_firwin(2 * half_len + 1, f_c, window=('kaiser', 5.0))
orig_wfilt.shape, orig_wfilt.dtype
((960001,), dtype('float64'))
So we'll use this modified FIR filter instead of the scipy default. Not only is it much smaller, but it also apparently performs better than the original.
# From http://signalsprocessed.blogspot.com/2017/05/resampling-in-python-electric-bugaloo.html
def resample_poly_filter(up, down, beta=5.0, L=16001, window=('kaiser', 5.0)):
# *** this block STOLEN FROM scipy.signal.resample_poly ***
# Determine our up and down factors
# Use a rational approximation to save computation time on really long
# signals
g_ = math.gcd(up, down)
up //= g_
down //= g_
max_rate = max(up, down)
sfact = np.sqrt(1+(beta/np.pi)**2)
# generate first filter attempt: with 6dB attenuation at f_c.
filt = sp_firwin(L, 1/max_rate, window=window)
N_FFT = 2**19
NBINS = N_FFT/2+1
paddedfilt = np.zeros(N_FFT)
paddedfilt[:L] = filt
ffilt = np.fft.rfft(paddedfilt)
# now find the minimum between f_c and f_c+sqrt(1+(beta/pi)^2)/L
bot = int(np.floor(NBINS/max_rate))
top = int(np.ceil(NBINS*(1/max_rate + 2*sfact/L)))
firstnull = (np.argmin(np.abs(ffilt[bot:top])) + bot)/NBINS
# generate the proper shifted filter
filt2 = sp_firwin(L, -firstnull+2/max_rate, window=window)
return filt2
wfilt = resample_poly_filter(P, Q, L=2**16+1)
wfilt.shape
(65537,)
wfilt_t = torch.tensor(wfilt, dtype=torch.float32)
wfilt_t.shape, wfilt_t.dtype
(torch.Size([65537]), torch.float32)
to_t = lambda a: torch.tensor(a, dtype=torch.float32)
The heart of scipy's resample_poly is the upfirdn method in scipy.signal._upfirdn, with the Cython implementation in scipy.signal._upfirdn_apply doing the actual work. The resample_poly method creates the FIR filter if you pass a window function (which we won't do, since we pass in our own filter), and does some trimming of the results of upfirdn; I gather this is needed to make the FIR work correctly across various input lengths and up/down ratios. Here we copy those setup bits and create a resample_poly that takes an upfirdn function we provide, so we can test various implementations. We also copy over the scipy.signal._upfirdn_apply._output_len method (converting it from Cython), which we'll need.
def ufd_out_len(n_h, n_in, up, down):
nt = n_in + (n_h + (-n_h % up)) // up - 1
nt *= up
need = nt // down
if nt % down > 0: need += 1
return need
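A quick sanity check of this formula on the toy sizes used further below (a 16-tap filter and 8 input samples):
ufd_out_len(16, 8, 4, 1), ufd_out_len(16, 8, 3, 2)  # -> (44, 20), matching the upfirdn output shapes we'll see below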
# Preprocessing from scipy's resample_poly
def resample_poly_pre(x, up, down, filt):
g_ = math.gcd(up, down)
up //= g_
down //= g_
if up == down == 1:
return x
n_x = x.shape[-1]
n_out = n_x * up
n_out = n_out // down + bool(n_out % down)
if filt.ndimension() != 1: raise ValueError("Filter should be a 1d tensor")
n_filt = len(filt)
half_len = (n_filt-1) // 2
h = filt * up
# Zero-pad filter to put output samples at center
n_pre_pad = (down - half_len % down)
n_post_pad = 0
n_pre_remove = (half_len + n_pre_pad) // down
# We should rarely need to do this given our filter lengths...
while ufd_out_len(n_filt + n_pre_pad + n_post_pad, n_x, up, down) < n_out + n_pre_remove:
n_post_pad += 1
h = F.pad(h, (n_pre_pad, n_post_pad))
n_pre_remove_end = n_pre_remove + n_out
remove = slice(n_pre_remove,n_pre_remove_end)
return h,x,up,down,remove
# And a resample_poly that takes an upfirdn to use.
def resample_poly(x, up, down, filt, upfirdn_func):
h,x,up,down,remove = resample_poly_pre(x, up, down, filt)
y = upfirdn_func(h, x, up, down)
return y[...,remove]
L = 4
tst1 = np.arange(1,9)
cf1 = np.arange(4,8)
tst1_t = torch.tensor(tst1, dtype=torch.float32)
cf1_t = torch.tensor(cf1, dtype=torch.float32)
Our first implementation of upfirdn will implement interpolation (the upfir bit) through zero-stuffing (inserting up-1 zeros between each sample) and then filtering (i.e. convolving). If we then downsample (select every down-th sample of) the output of this, we have the full upfirdn function from scipy.
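For example (toy values), zero-stuffing [1, 2, 3] with up=3 gives [1, 0, 0, 2, 0, 0, 3, 0, 0]; the view trick used below does this without a Python loop:
x_demo = torch.tensor([1., 2., 3.])
x_zp_demo = torch.zeros(len(x_demo) * 3)
x_zp_demo.view(-1, 3)[:, 0] = x_demo  # write each sample into the first slot of each group of 3
x_zp_demo  # tensor([1., 0., 0., 2., 0., 0., 3., 0., 0.])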
def upfir_zs(h, x, up): # Zero-padding version
# Zero stuff x
n_h = h.shape[0]
n_x = x.shape[-1]
x_zp = torch.zeros(n_x * up, dtype=x.dtype)
x_zp.view(-1, up)[:,0] = x
# FIR via Conv1d
x_zp_c = x_zp[None,None,...] # Add batch and channel dimensions
    h_c = h.flip(0)[None,None,...] # Reverse coefficients and add out/in channel dimensions
pad = n_h-1
y = F.conv1d(x_zp_c, h_c, padding=pad)
n_out = ufd_out_len(n_h, n_x, up, 1)
return y[0,0,:n_out]
for n_cf in range(4,9):
cf = np.arange(n_cf,n_cf*2 + 1)
for n_x in range(1, 28, 3):
x = np.arange(1, n_x+1)
for up in range(2,10):
y = sp_upfirdn(cf, x, up, 1)
y_t = upfir_zs(to_t(cf), to_t(x), up)
np.testing.assert_allclose(y, y_t.numpy(), err_msg=f'up={up}')
print('All good!')
All good!
def upfirdn_zs(h, x, up, down): # Zero-padding version
y = upfir_zs(h, x, up)
return y[::down]
for n_cf in range(4,9):
cf = np.arange(n_cf,n_cf*2 + 1)
for n_x in range(1, 28, 3):
x = np.arange(1, n_x+1)
for up in range(2,10):
for down in range(1,10):
y = sp_upfirdn(cf, x, up, down)
y_t = upfirdn_zs(to_t(cf), to_t(x), up, down)
np.testing.assert_allclose(y, y_t.numpy(), err_msg=f'up={up}, down={down}, n_cf={n_cf}, n_x={n_x}')
print('All good!')
All good!
sig_t = torch.tensor(sweep, dtype=torch.float32)
sig_t.shape, sig_t.dtype
(torch.Size([96000]), torch.float32)
h_t,x_t,up,down,_ = resample_poly_pre(sig_t, P, Q, wfilt_t)
h_t.shape, x_t.shape, up, down
(torch.Size([65569]), torch.Size([96000]), 147, 160)
n_h = h_t.shape[0]
n_x = x_t.shape[-1]
n_h, n_x, n_x * up
#x_zp = torch.zeros(n_x * up, dtype=x.dtype)
#x_zp.view(-1, up)[:,0] = x
(65569, 96000, 14112000)
This crashes PyTorch; it doesn't seem to like convolving a 14M-sample input with a 65K-tap kernel.
#upfirdn_zs(h_t, x_t, up, down)
Here we perform polyphase interpolation. Instead of zero-stuffing we use the trick outlined in http://dspguru.com/dsp/faqs/multirate/interpolation/: we divide our n_h-tap filter into up separate n_h/up-tap filters, each one a different phase. We apply each one to the entire input and then interleave their outputs. This gives output equivalent to applying the original filter to the zero-stuffed input, but with fewer computations and less memory.
Broken down:
up,n_h,n_x = 4,16,8
h_a = np.arange(1 ,n_h+1)*10
x_a = np.arange(n_x)
h_a, x_a
(array([ 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160]), array([0, 1, 2, 3, 4, 5, 6, 7]))
y_a = sp_upfirdn(h_a, x_a, up, 1)
y_a.shape, y_a
((44,), array([ 0., 0., 0., 0., 10., 20., 30., 40., 70., 100., 130., 160., 220., 280., 340., 400., 500., 600., 700., 800., 780., 920., 1060., 1200., 1060., 1240., 1420., 1600., 1340., 1560., 1780., 2000., 1540., 1720., 1900., 2080., 1410., 1540., 1670., 1800., 910., 980., 1050., 1120.]))
h,x = [torch.tensor(a, dtype=torch.float32) for a in [h_a,x_a]]
h,x
(tensor([ 10., 20., 30., 40., 50., 60., 70., 80., 90., 100., 110., 120., 130., 140., 150., 160.]), tensor([0., 1., 2., 3., 4., 5., 6., 7.]))
n_h,n_x = len(h), len(x)
h_pad = F.pad(h, (0, -n_h % up)) # Pad to multiple of up
n_h, n_x, h_pad.shape
(16, 8, torch.Size([16]))
h_pad
tensor([ 10., 20., 30., 40., 50., 60., 70., 80., 90., 100., 110., 120., 130., 140., 150., 160.])
n_h_pp = len(h_pad)//up
h_pp = h_pad.view(1, 1, n_h_pp, -1).transpose(0,1).flip(2)
h_pp.shape, n_h_pp
(torch.Size([1, 1, 4, 4]), 4)
[h_pp[...,i] for i in range(up)]
[tensor([[[130., 90., 50., 10.]]]), tensor([[[140., 100., 60., 20.]]]), tensor([[[150., 110., 70., 30.]]]), tensor([[[160., 120., 80., 40.]]])]
#n_y_pp = ufd_out_len(n_h, n_x, up, down) # Length of a single phases output
n_y_pp = n_x + n_h_pp-1
n_x, n_y_pp
(8, 11)
y_pp = torch.zeros(n_y_pp * up).view(-1,up)
y_pp.shape, y_pp[:,0].shape
(torch.Size([11, 4]), torch.Size([11]))
for i in range(up):
y_pp[:,i] = F.conv1d(x[None,None,:], h_pp[...,i], padding=n_h_pp-1)
y_pp.shape, y_pp
(torch.Size([11, 4]), tensor([[ 0., 0., 0., 0.], [ 10., 20., 30., 40.], [ 70., 100., 130., 160.], [ 220., 280., 340., 400.], [ 500., 600., 700., 800.], [ 780., 920., 1060., 1200.], [1060., 1240., 1420., 1600.], [1340., 1560., 1780., 2000.], [1540., 1720., 1900., 2080.], [1410., 1540., 1670., 1800.], [ 910., 980., 1050., 1120.]]))
y = y_pp.view(-1)
y
tensor([ 0., 0., 0., 0., 10., 20., 30., 40., 70., 100., 130., 160., 220., 280., 340., 400., 500., 600., 700., 800., 780., 920., 1060., 1200., 1060., 1240., 1420., 1600., 1340., 1560., 1780., 2000., 1540., 1720., 1900., 2080., 1410., 1540., 1670., 1800., 910., 980., 1050., 1120.])
np.allclose(y.numpy(), y_a)
True
That code, all together:
def upfir_conv(h, x, up):
n_h,n_x = len(h),len(x)
h_pad = F.pad(h, (0, -n_h % up)) # Pad to multiple of up
n_h_pp = len(h_pad) // up # number of taps per phase
h_pp = h_pad.view(1, 1, n_h_pp, -1).transpose(0,1).flip(2) # Filters per phase
n_y_pp = n_x + n_h_pp-1 # Outputs per phase
y_pp = torch.zeros(n_y_pp * up).view(-1,up)
for i in range(up): # Apply phases
y_pp[:,i] = F.conv1d(x[None,None,:], h_pp[...,i], padding=n_h_pp-1)
y = y_pp.view(-1) # Interleave outputs
return y
y = upfir_conv(to_t(h_a), to_t(x_a), up)
y.shape, y
(torch.Size([44]), tensor([ 0., 0., 0., 0., 10., 20., 30., 40., 70., 100., 130., 160., 220., 280., 340., 400., 500., 600., 700., 800., 780., 920., 1060., 1200., 1060., 1240., 1420., 1600., 1340., 1560., 1780., 2000., 1540., 1720., 1900., 2080., 1410., 1540., 1670., 1800., 910., 980., 1050., 1120.]))
np.allclose(y.numpy(), y_a)
True
for n_h in range(3, 9):
h = np.arange(1 ,n_h+1)*10
for n_x in range(2, 52, 7):
x = np.arange(n_x)
for up in range(2,10):
s = f"up={up}, n_h={n_h}, n_x={n_x}"
#for down in range(1,10):
y = sp_upfirdn(h, x, up, 1)
try:
y_t = upfir_conv(to_t(h), to_t(x), up) #, down)
except Exception as exc:
print(f"{s}: Exception {exc}")
continue
np.testing.assert_allclose(y, y_t.numpy(), err_msg=s)
print('All good!')
All good!
def upfirdn_conv(h, x, up, down):
y = upfir_conv(h, x, up)
return y[...,::down]
up, down = 3,2
y_a = sp_upfirdn(h_a, x_a, up, down)
y_a.shape, y_a
((20,), array([ 0., 0., 20., 60., 120., 240., 400., 600., 900., 1260., 1500., 1700., 2200., 2160., 1920., 2280., 1740., 980., 1120., 0.]))
y = upfirdn_conv(to_t(h_a), to_t(x_a), up, down)
y.shape, y
(torch.Size([20]), tensor([ 0., 0., 20., 60., 120., 240., 400., 600., 900., 1260., 1500., 1700., 2200., 2160., 1920., 2280., 1740., 980., 1120., 0.]))
np.allclose(y.numpy(), y_a)
True
for n_h in range(3, 9):
h = np.arange(1 ,n_h+1)*10
for n_x in range(2, 52, 7):
x = np.arange(n_x)
for up in range(2,10):
s = f"up={up}, down={down}, n_h={n_h}, n_x={n_x}"
for down in range(1,10):
y = sp_upfirdn(h, x, up, down)
try:
y_t = upfirdn_conv(to_t(h), to_t(x), up, down)
except Exception as exc:
print(f"{s}: Exception {exc}")
continue
np.testing.assert_allclose(y, y_t.numpy(), err_msg=s)
print('All good!')
All good!
y = resample_poly(sweep_t, P, Q, wfilt_t, upfirdn_conv)
print(y.shape)
specgram(y.numpy()*30, Fs=P)
torch.Size([88200])
y_a = sp_resample_poly(sweep, P, Q, window=wfilt)
print(y_a.shape)
specgram(y_a*30, Fs=P)
(88200,)
# TODO: Check this is just calculation error, not bug
np.testing.assert_allclose(y.numpy(), y_a)
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-60-2904afc1b2a2> in <module>
      1 # TODO: Check this is just calculation error, not bug
----> 2 np.testing.assert_allclose(y.numpy(), y_a)

~/.conda/envs/fastai_audio-dev/lib/python3.7/site-packages/numpy/testing/_private/utils.py in assert_allclose(actual, desired, rtol, atol, equal_nan, err_msg, verbose)
   1450     header = 'Not equal to tolerance rtol=%g, atol=%g' % (rtol, atol)
   1451     assert_array_compare(compare, actual, desired, err_msg=str(err_msg),
-> 1452                          verbose=verbose, header=header, equal_nan=equal_nan)
   1453
   1454

~/.conda/envs/fastai_audio-dev/lib/python3.7/site-packages/numpy/testing/_private/utils.py in assert_array_compare(comparison, x, y, err_msg, verbose, header, precision, equal_nan, equal_inf)
    787                                 verbose=verbose, header=header,
    788                                 names=('x', 'y'), precision=precision)
--> 789             raise AssertionError(msg)
    790         except ValueError:
    791             import traceback

AssertionError:
Not equal to tolerance rtol=1e-07, atol=0

(mismatch 51.30612244897959%)
 x: array([ 0.012801,  0.027505,  0.041481, ..., -0.193291,  0.232561, -0.293211], dtype=float32)
 y: array([ 0.012801,  0.027505,  0.041481, ..., -0.193291,  0.232561, -0.293211])
diffs = np.abs(y.numpy() - y_a)
diffs.max(), diffs.mean()
(1.1749612726097425e-06, 8.254097168143291e-08)
rel_diffs = diffs / y_a
rel_diffs.max(), rel_diffs.mean()
(1.0, 1.759182941034847e-05)
mm = (~np.isclose(y.numpy(), y_a)).sum()
print(f"{mm} mismatching samples, {mm/len(y_a):.1%}")
2367 mismatching samples, 2.7%
So those results look reasonable; there might be a slight numerical error given it's not allclose, but nothing obviously wrong in the spectrogram.
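(For reference, a float32-scale tolerance does pass, consistent with the max absolute difference of ~1.2e-6 above:)
np.testing.assert_allclose(y.numpy(), y_a, atol=1e-5)  # passes: atol comfortably above the observed max diff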
As can be seen in the performance section below, this performs terribly. I think the issue is that we convolve our ~64K-tap filter with all of the input samples and then discard all but 1/down of the results when we downsample; that's 159/160 of the work thrown away in the 48000->44100 example we used. So, instead of using torch's conv1d, let's do the filtering ourselves.
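As a rough back-of-envelope check on that (using the padded ~65K-tap filter split into 147 phases; sizes from the shapes above):
n_x_, n_h_, up_, down_ = 96000, 65569, 147, 160
n_hpp_ = -(-n_h_ // up_)                                  # taps per phase (447)
macs_conv = up_ * n_x_ * n_hpp_                           # the conv version computes every interleaved output
macs_dot = ufd_out_len(n_h_, n_x_, up_, down_) * n_hpp_   # we only actually need the kept outputs
macs_conv / macs_dot                                      # ~160x more multiply-adds than necessary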
def upfirdn_dot(h, x, up, down):
n_h,n_x = len(h), len(x)
# Split filter into up phases, padding so even lengths, coefficients in each phase should be reversed
hpad = F.pad(h, (0, -n_h%up))
n_hpp = len(hpad)//up
hpp = hpad.view(n_hpp, -1).t().flip(1)
# Create padded input
xpad = F.pad(x, (n_hpp-1,n_hpp-1))
# Create output array and fill it
n_y = ufd_out_len(n_h, n_x, up, down)
y = torch.zeros(n_y, dtype=x.dtype, device=x.device)
for i_y in range(n_y):
i_x = i_y * down // up # Input idx at which to filter
i_hpp = (i_y * down) % up # Filter phase to use - i_y*down gives non-downsampled index
#print(f"y[{i_y:>2}] = h_pp[{i_hpp}] . xpad[{i_x:>2}:{i_x+n_hpp:>2}]")
# Apply FIR to sample by calculating dot product
y[i_y] = hpp[i_hpp,:].dot(xpad[i_x:i_x+n_hpp])
return y
y = resample_poly(sweep_t, P, Q, wfilt_t, upfirdn_dot)
print(y.shape)
specgram(y.numpy()*30, Fs=P)
torch.Size([88200])
torch.cuda.get_device_name('cuda:0')
'GeForce RTX 2070'
!cat /proc/cpuinfo | grep 'model name' | uniq
model name : AMD Ryzen 5 2600 Six-Core Processor
I tested performance of the above resample_torch implementation, run on CPU, on the entire urbansound8k dataset. This is 6.7GB containing 8732 wav files, mostly 4 seconds each, though some are shorter. It's stored on a 250GB Samsung 970 Evo Plus NVMe. Various sample rates:
44100.0 5370
48000.0 2502
96000.0 610
24000.0 82
16000.0 45
22050.0 44
11025.0 39
192000.0 17
8000.0 12
11024.0 7
32000.0 4
Here are the results. pt_resample is the implementation above, ta_load is torchaudio.load, and all resample results include loading the file and resampling everything to 22050Hz. I ran sync && echo 3 > /proc/sys/vm/drop_caches before each run to clear caches, which seemed to work.
Processing 8732 files with 5743M samples:
ta_load: 13.9s total; mean: 1.59ms; std: 1.364ms; max: 27.7ms
sp_resample: 1694.2s total; mean: 194.03ms; std: 2015.948ms; max: 91911.7ms
sp_resample_poly: 75.4s total; mean: 8.64ms; std: 4.568ms; max: 60.4ms
sp_decimate2: 75.4s total; mean: 8.63ms; std: 4.440ms; max: 46.1ms
pt_resample: 43.7s total; mean: 5.01ms; std: 2.703ms; max: 46.2ms
Note that while scipy is using one core (~90% utilisation in top), PyTorch uses all cores (~580% utilisation). So this is a fair bit slower than SciPy's resample_poly when adjusting for this, but obviously a whole lot faster than SciPy's resample.
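Roughly what each pt_resample timing covers per file, as a sketch (fname here is just a placeholder path, and torchaudio isn't otherwise imported in this notebook):
import torchaudio
sig, sr = torchaudio.load(fname)   # fname: placeholder path to one wav file
if sr != 22050:
    sig = resample_torch(sig, int(round(sig.shape[-1] * 22050 / sr)))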
n_rep = 1000
for from_sr,to_sr in [(48000,44100), (48000,22050),(44100,22050),(22050,16000)]:
times = [None]*n_rep
n_from = 4*from_sr # 4 seconds
n_to = 4*to_sr
for i in range(n_rep):
sig = torch.rand(1, n_from, dtype=torch.float32, device='cpu')
start,done = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
start.record()
sig_g = sig.to(device='cuda:0')
y_g = resample_torch(sig_g, n_to)
done.record()
done.synchronize()
times[i] = start.elapsed_time(done)
del sig
times = np.array(times)
print(f"{from_sr}->{to_sr}: {times.mean():>2.3f}ms +/- {times.std():>1.4f}ms; {times.mean()/from_sr*1000*1000:>2.3f}us/KSamp")
48000->44100: 0.342ms +/- 0.0067ms; 7.128us/KSamp
48000->22050: 0.342ms +/- 0.0036ms; 7.125us/KSamp
44100->22050: 0.340ms +/- 0.0281ms; 7.704us/KSamp
22050->16000: 0.292ms +/- 0.0026ms; 13.239us/KSamp
And allowing operations across items to overlap:
n_rep = 1000
for from_sr,to_sr in [(48000,44100), (48000,22050),(44100,22050),(22050,16000)]:
n_from = 4*from_sr # 4 seconds
n_to = 4*to_sr
start,done = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
start.record()
for i in range(n_rep):
sig_g = torch.rand(n_from, dtype=torch.float32, device='cuda:0')
y_g = resample_torch(sig_g, n_to)
done.record()
done.synchronize()
t = start.elapsed_time(done)
print(f"{from_sr}->{to_sr}: {t/1000:.3f}s; {t/n_rep*1000:2.3f}us/item; {t/n_rep*1000/from_sr*1000:2.3f}us/KSamp")
48000->44100: 0.225s; 225.322us/item; 4.694us/KSamp
48000->22050: 0.225s; 224.782us/item; 4.683us/KSamp
44100->22050: 0.228s; 228.145us/item; 5.173us/KSamp
22050->16000: 0.222s; 222.457us/item; 10.089us/KSamp
So pretty fast.
Just synthetic tests here, so all the usual caveats apply.
%timeit resample_poly(sweep_t, P, Q, wfilt_t, upfirdn_conv)
1.06 s ± 5.42 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
sweep_t.shape
torch.Size([96000])
%timeit sp_resample_poly(sweep_t, P, Q, window=wfilt)
32 ms ± 47.4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
So not great on CPU, and that's with PyTorch using multiple threads versus scipy which only uses one. What about GPU?
Q,P=48000,16000
wf_t = torch.tensor(resample_poly_filter(P, Q, L=49153), dtype=torch.float32)
evts = [torch.cuda.Event(enable_timing=True) for _ in range(4)]
start,cp_filt,cp_sig,done = evts
start.record()
wfilt_g = wf_t.to(device='cuda:0')
cp_filt.wait()
cp_filt.record()
sig_g = sweep_t.to(device='cuda:0')
cp_sig.wait()
cp_sig.record()
y_g = resample_poly(sig_g, P, Q, wfilt_g, upfirdn_conv)
done.record()
done.synchronize()
cp_filt_t,cp_sig_t,rs_t = [evts[i].elapsed_time(evts[i+1]) for i in range(len(evts)-1)]
print(f"Copy filer: {cp_filt_t:4.3f}ms; Copy signal: {cp_sig_t:4.3f}ms; Resample: {rs_t:4.3f}ms")
del sig_g, wfilt_g
Copy filter: 0.875ms; Copy signal: 0.360ms; Resample: 448.299ms
n_rep = 50
times = [None]*n_rep
wfilt_g = wf_t.to(device='cuda:0')
for i in range(n_rep):
sig = torch.rand(4*Q, dtype=torch.float32, device='cpu')
start,done = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
start.record()
sig_g = sig.to(device='cuda:0')
y_g = resample_poly(sig_g, P, Q, wfilt_g, upfirdn_conv)
done.record()
done.synchronize()
times[i] = start.elapsed_time(done)
times = np.array(times)
print(f"{times.mean():3.1f}ms +/- {times.std():.3f}ms")
179.8ms +/- 9.918ms
n_rep = 5
wfilt_g = wf_t.to(device='cuda:0')
start,done = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
start.record()
for i in range(n_rep):
sig_g = torch.rand(96000, dtype=torch.float32, device='cuda:0')
y_g = resample_poly(sig_g, P, Q, wfilt_g, upfirdn_conv)
done.record()
done.synchronize()
t = start.elapsed_time(done)
print(f"{t/1000:.3f}s; {t/n_rep:.3f}ms/item")
0.532s; 106.418ms/item
So, it doesn't perform very well. We might get faster than this with some explicit parallelism through torch.cuda.streams (or, since I think multiple threads get around some stream synchronisation issues, doing this in a databunch with num_workers>1 might help).
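A minimal sketch of the explicit-streams idea (untested; sigs and n_streams are made up here, and the non_blocking copies only overlap if the CPU tensors are pinned):
n_streams = 4
streams = [torch.cuda.Stream() for _ in range(n_streams)]
outs = []
for i, sig in enumerate(sigs):                      # sigs: hypothetical list of 1-D CPU tensors
    with torch.cuda.stream(streams[i % n_streams]):
        sig_g = sig.pin_memory().to('cuda:0', non_blocking=True)
        outs.append(resample_poly(sig_g, P, Q, wfilt_g, upfirdn_conv))
torch.cuda.synchronize()                            # wait for all streams before using outs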
I think this might be inherent to this way of implementing it. We convolve our 96000 input samples with a ~65000-tap kernel, which is a lot of FLOPs, and then throw out 159/160 of the outputs (decimating by down), so most of that calculation is wasted. I think scipy's resample_poly doesn't calculate those unneeded samples (though I'm not quite sure), but we can't avoid them with our torch.nn.conv1d implementation. Note we can't use stride as we possibly could in a non-polyphase implementation, because we need to stride over the interleaved results, not each phase's output, and since we reduce up and down by their GCD there is no common factor of up and down left to stride by.
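For the 48000->44100 case, for instance:
math.gcd(147, 160)  # 1 -- no shared factor, so no single conv1d stride picks out every down-th interleaved sample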
Trying a smaller filter:
wf_t = torch.tensor(resample_poly_filter(P, Q, L=8001), dtype=torch.float32)
n_rep = 5
wfilt_g = wf_t.to(device='cuda:0')
start,done = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
start.record()
for i in range(n_rep):
sig_g = torch.rand(96000, dtype=torch.float32, device='cuda:0')
y_g = resample_poly(sig_g, P, Q, wfilt_g, upfirdn_conv)
done.record()
done.synchronize()
t = start.elapsed_time(done)
print(f"{t/1000:.3f}s; {t/n_rep:.3f}ms/item")
0.260s; 52.055ms/item
A fair bit better, but still slow.
Removing the unneeded calculations:
%timeit resample_poly(sweep_t, P, Q, wfilt_t, upfirdn_dot)
1.27 s ± 10.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
sweep_t.shape
torch.Size([96000])
%timeit sp_resample_poly(sweep, P, Q, window=wfilt)
3.51 s ± 180 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
wf_t = torch.tensor(resample_poly_filter(P, Q, L=1001), dtype=torch.float32)
n_rep = 5
times = [None]*n_rep
wfilt_g = wf_t.to(device='cuda:0')
sig_g = torch.rand(96000, dtype=torch.float32, device='cuda:0')
for i in range(n_rep):
start,done = [torch.cuda.Event(enable_timing=True) for _ in range(2)]
start.record()
y_g = resample_poly(sig_g, P, Q, wfilt_g, upfirdn_dot)
done.record()
done.synchronize()
times[i] = start.elapsed_time(done)
times = np.array(times)
print(f"{times.mean():3.1f}ms +/- {times.std():.3f}ms")
5272.7ms +/- 78.641ms
Ouch, a whole lot worse. I think the sheer number of tiny kernel launches destroys performance.
torch.cuda.get_device_properties('cuda:0')
_CudaDeviceProperties(name='GeForce RTX 2070', major=7, minor=5, total_memory=7979MB, multi_processor_count=36)
torch.arange(3).dot(torch.arange(1,4))
tensor(8)
Q,P=48000,44100
up = P/math.gcd(P,Q)
down = Q/math.gcd(P,Q)
N = 64*1024 + 1
wfilt = resample_poly_filter(P, Q, L=2**16+1)
wfilt_t = torch.tensor(wfilt, dtype=torch.float32)
up,down,wfilt.shape
(147.0, 160.0, (65537,))
specgram(sp_resample_poly(sweep, Q, P)*30, Fs=P)
specgram(sp_resample_poly(sweep, Q, P, window=wfilt)*30, Fs=P)
nts = [2001,4001,8001,16001,32001,49153,65537]
fig,axs = plt.subplots(len(nts), 1, figsize=(15,4*len(nts)))
for nt,ax in zip(nts,axs):
wf = resample_poly_filter(P, Q, L=nt)
y = sp_resample_poly(sweep, P, Q, window=wf)
specgram(y*30, Fs=P, ax=ax)
ax.set_title(f'N={nt}')
def freqresp(filt, Q, P, frng=(0,1.25), N_freq=1024):
    '''Get the frequency response of a resampling FIR across the `frng` frequency range (expressed as a fraction of the output Nyquist, P/2)'''
    # The filter operates on the upsampled signal.
    # Frequencies are relative to this, such that pi is the Nyquist limit.
    up = P // math.gcd(P, Q)
    down = Q // math.gcd(P, Q)
    nyq_us = Q * up / 2
    freqs = np.linspace(frng[0]*np.pi/down, frng[1]*np.pi/down, N_freq, endpoint=False) # pi/down will be the cutoff freq when we downsample
    w,h = scipy.signal.freqz(filt, worN=freqs)
    return (w/np.pi)*nyq_us, np.absolute(h)
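To make that normalisation concrete for the 48000->44100 case (P=44100, Q=48000):
g = math.gcd(P, Q)
up_, down_ = P // g, Q // g   # 147, 160
Q * up_ / 2                   # Nyquist of the upsampled signal: 3,528,000 Hz
Q * up_ / 2 / down_           # pi/down therefore corresponds to 22050 Hz, the output Nyquist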
w,h = freqresp(wfilt, Q, P)
plt.plot(w, h, linewidth=2);
# Deviation from ideal based on https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.kaiserord.html#scipy.signal.kaiserord
w,h = freqresp(wfilt, Q, P, frng=(0.5,1.5))
ideal = w < P/2 # Ideal filter
deviation = np.abs(h - ideal)
plt.figure(figsize=(10,4))
plt.plot(w, 20*np.log10(np.abs(deviation)))
plt.axhline(-65, color='r', ls='--', alpha=0.3)
plt.ylim(-90, -45);
wf = resample_poly_filter(P, Q, L=2**16+1, window=('blackman'))
w,h = freqresp(wf, Q, P, frng=(0.5,1.5))
ideal = w < P/2 # Ideal filter
deviation = np.abs(h - ideal)
plt.figure(figsize=(10,4))
plt.plot(w, 20*np.log10(np.abs(deviation)))
plt.axhline(-65, color='r', ls='--', alpha=0.3)
plt.ylim(-90, -45);
nts = [2001,4001,8001,16001,32001,49153,65537]
fig,axs = plt.subplots(len(nts), 1, figsize=(9,4*len(nts)))
for nt,ax in zip(nts,axs):
wf = resample_poly_filter(P, Q, L=nt)
w,h = freqresp(wf, Q, P, frng=(0.5,1.5))
ideal = w < P/2 # Ideal filter
deviation = np.abs(h - ideal)
ax.plot(w, 20*np.log10(np.abs(deviation)))
ax.axhline(-65, color='r', ls='--', alpha=0.3)
ax.set_ylim(-90, -45);
ax.set_title(f'N={nt}')