Note: this notebook is extremely messy as a result of me trying to rapidly prototype rocket for raw audio and not following best practices. It also uses the beta version of fastai v2 for removing silence and preprocessing. If you are interested in experimenting yourself and can't make sense of something here, please reach out to me by PM, or in the Deep Learning with Audio or Time Series threads
This notebook will apply the findings of the recent Rocket Paper by Angus Dempster, François Petitjean, Geoffrey I. Webb to 1D raw audio signals for the task of voice recognition. Some of this code is also adapted from Ignacio Oguiza and his Time Series Module for FastAI v1
Initially the signals were too long and too slow to train on at a sample rate of 16000 (it was going to take ~30-40 minutes for 1s clips). The test problem is a 3800-clip, 10-class audio dataset (a small problem that trains to 99%+ accuracy in 2 minutes using the typical audio pipeline of spectrogram + CNN). To speed things up I added a stride, which sped up results without a drop in accuracy but still only led to 85% accuracy after 4 minutes of training. Removing silence and doubling the clip length to 2s allowed us to get great results (95% accuracy in 6s, 98.6% in 20 seconds, 99.2% in 1 min 20 sec).
Unfortunately so far, I have not been able to scale the results to harder problems, such as a 250 speaker dataset.
In order to spare you having to go through this notebook, here's a summary of the interesting results:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
from local.torch_basics import *
from local.test import *
from local.basics import *
from local.data.all import *
from local.vision.core import *
from local.notebook.showdoc import show_doc
from local.audio.core import *
from local.audio.augment import *
from local.vision.learner import *
from local.vision.models.xresnet import *
from local.metrics import *
from local.callback.schedule import *
import torchaudio
from fastprogress import progress_bar as pb
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifierCV
from rocket import generate_kernels, apply_kernel, apply_kernels
# Target folder for the 10-speaker dataset, under the configured data path.
p10speakers = Config()['data_path'] / 'ST-AEDS-20180100_1-OS'
# Download and unpack; the archive is stored next to the target folder.
untar_data(URLs.SPEAKERS10, dest=p10speakers, fname=f'{p10speakers}.tar')
PosixPath('/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/ST-AEDS-20180100_1-OS')
# Build a recursive audio-file getter and collect every file under the dataset folder.
get_audio = AudioGetter("", recurse=True, folders=None)
files_10 = get_audio(p10speakers)
# Echo the collected file list.
files_10
(#3842) [/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0004_us_f0004_00446.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0002_us_m0002_00128.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0003_us_f0003_00279.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0001_us_f0001_00168.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0005_us_f0005_00286.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0005_us_m0005_00282.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0005_us_f0005_00432.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0005_us_f0005_00054.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0004_us_m0004_00110.wav,/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/m0003_us_m0003_00180.wav...]
audio_opener = OpenAudio(files_10)


def p10_labeler(x):
    """Return the label for a file: the first five characters of its file name."""
    # Path(...).name is separator-agnostic, unlike splitting the string on '/'.
    from pathlib import Path
    return Path(str(x)).name[:5]


# presumably milliseconds: the stacked clips end up 32000 samples long -- TODO confirm
CLIP_LENGTH = 2000
sigs, labels = [], []
cropper = CropSignal(CLIP_LENGTH, pad_mode='repeat')
remove_silence = RemoveSilence()
# The opener is called with a position, so iterate over indices rather than paths.
for i in pb(range(len(files_10))):
    # Pipeline per clip: open -> strip silence -> crop/pad -> keep the raw signal.
    sigs.append(cropper(remove_silence(audio_opener(i))).sig)
    labels.append(p10_labeler(files_10[i]))
# Sanity check: one label per signal.
len(sigs), len(labels)
(3842, 3842)
# Random 80/20 split over clip indices.
total_size = len(sigs)
train_size = int(total_size*.8)
train_idxs = torch.randperm(total_size)[:train_size]
# Testing membership against the tensor scans it for every index; a set keeps
# building the complement linear in the dataset size.
_train_lookup = set(train_idxs.tolist())
valid_idxs = [i for i in range(total_size) if i not in _train_lookup]
assert len(train_idxs) + len(valid_idxs) == len(sigs)
# squeeze(0) drops the leading axis of each signal (presumably the channel axis).
x_train = [sigs[idx].squeeze(0).numpy() for idx in train_idxs]
y_train = [labels[idx] for idx in train_idxs]
x_valid = [sigs[idx].squeeze(0).numpy() for idx in valid_idxs]
y_valid = [labels[idx] for idx in valid_idxs]
# Sanity check: sizes of the four splits.
list(map(len, (x_train, y_train, x_valid, y_valid)))
[3073, 3073, 769, 769]
# Stack the per-clip arrays into one matrix per split (one row per clip), as float64.
np_x_train = np.stack(x_train).astype(np.float64)
np_x_valid = np.stack(x_valid).astype(np.float64)
# Echo the resulting shapes.
np_x_train.shape, np_x_valid.shape
((3073, 32000), (769, 32000))
def o2i_f(x):
    """Map a label to a class index: last digit minus one, plus 5 if it starts with 'm'."""
    offset = 5 if x[0] == 'm' else 0
    return offset + int(x[-1]) - 1


# Integer-encode the labels of both splits.
np_y_train = np.array([o2i_f(label) for label in y_train])
np_y_valid = np.array([o2i_f(label) for label in y_valid])
np_y_train
array([0, 7, 2, ..., 0, 7, 1])
# Sanity check: features and labels of each split must have matching first dimensions.
np_x_train.shape, np_y_train.shape, np_x_valid.shape, np_y_valid.shape
((3073, 32000), (3073,), (769, 32000), (769,))
# Global mean of the training signals before normalisation.
np_x_train.mean()
-4.49039649777175e-05
# Standardise every clip (row) independently; the epsilon guards against a zero
# standard deviation.
_train_mean = np_x_train.mean(axis=1, keepdims=True)
_train_std = np_x_train.std(axis=1, keepdims=True)
np_x_train = (np_x_train - _train_mean) / (_train_std + 1e-8)

_valid_mean = np_x_valid.mean(axis=1, keepdims=True)
_valid_std = np_x_valid.std(axis=1, keepdims=True)
np_x_valid = (np_x_valid - _valid_mean) / (_valid_std + 1e-8)

# Check the overall statistics after normalisation.
np_x_train.mean(), np_x_train.std()
(-8.10809639770585e-20, 0.9999995545301024)
# Confirm the training array kept the float64 dtype requested when stacking.
np_x_train.dtype
dtype('float64')
def timing_test(runs, candidate_lengths, stride, num_kernels, seq_length, show_progress=True):
    """Repeatedly build random kernels, fit a ridge classifier and record time and accuracy.

    Uses the module-level ``np_x_train``/``np_y_train`` for fitting and
    ``np_x_valid``/``np_y_valid`` for scoring.

    Args:
        runs: number of independent repetitions.
        candidate_lengths: array of kernel lengths passed to ``generate_kernels``.
        stride: stride passed to ``generate_kernels``.
        num_kernels: number of kernels generated per run.
        seq_length: input length passed to ``generate_kernels``.
        show_progress: print one line per finished run when true.

    Returns:
        ``(times, scores)``: per-run elapsed seconds and validation accuracies.
    """
    times, scores = [], []
    for run_idx in range(1, runs + 1):
        kernels = generate_kernels(seq_length, num_kernels, candidate_lengths, stride)
        # The clock starts after kernel generation: only transform + fit + score are timed.
        tic = time.time()
        train_features = apply_kernels(np_x_train, kernels)
        valid_features = apply_kernels(np_x_valid, kernels)
        clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 7), normalize=True)
        clf.fit(train_features, np_y_train)
        accuracy = clf.score(valid_features, np_y_valid)
        elapsed = time.time() - tic
        scores.append(accuracy)
        times.append(elapsed)
        if show_progress:
            print("Finished Run", run_idx, "Score:", round(accuracy, 3), "Time:", round(elapsed, 3))
    return times, scores
# Five runs with 200 kernels of length 7/9/11 at stride 5.
# NOTE(review): seq_length=16000 although the arrays built above hold 32000 samples
# per clip -- presumably left over from the 1 s experiments; confirm.
timing_test(5, np.array((7,9,11)), stride=5, num_kernels=200, seq_length=16000)
Kernel Sizes | Strides | Results | Time |
---|---|---|---|
{7,9,11} | 7 | .85 | 4:02 |
{7,9,11} | 5 | .899 | 5:20 |
{7,9,11} | 3 | .903 | 8:15 |
{800,1000,1200} | 400 | .46 | 3:43 |
Silence removed results 10000 kernels
Kernel Sizes | Strides | Results | Time |
---|---|---|---|
{7,9,11} | 5 | .979 | 5:08 |
{7,9,11} | 5 | .976 | 5:10 |
{7,9,11} | 5 | .980 | 5:26 |
Silence removed results 2000 kernels
Kernel Sizes | Strides | Results | Time |
---|---|---|---|
{7,9,11} | 5 | .972 | 1:10 |
{7,9,11} | 5 | .979 | 1:03 |
{7,9,11} | 5 | .976 | 1:01 |
Silence removed results 1000 kernels
Kernel Sizes | Strides | Results | Time |
---|---|---|---|
{7,9,11} | 5 | .974 | 0:31 |
{7,9,11} | 5 | .974 | 0:31 |
{7,9,11} | 5 | .966 | 0:31 |
Silence removed results 200 kernels
Kernel Sizes | Strides | Results | Time |
---|---|---|---|
{7,9,11} | 5 | .949 | 0:06 |
{7,9,11} | 5 | .950 | 0:06 |
{7,9,11} | 5 | .934 | 0:06 |
Silence removed results 200 kernels
Kernel Sizes | Strides | Results | Time |
---|---|---|---|
{7,9,11} | 1 | .942 | 0:30 |
{7,9,11} | 1 | .954 | 0:28 |
{7,9,11} | 1 | .950 | 0:28 |
Silence removed results 200 kernels
Kernel Sizes | Strides | Results | Time |
---|---|---|---|
{7,9,11} | 3 | .960 | 0:10 |
{7,9,11} | 3 | .948 | 0:10 |
{7,9,11} | 3 | .954 | 0:10 |
Silence removed results 200 kernels
Kernel Sizes | Strides | Results | Time |
---|---|---|---|
{7,9,11} | 5 | .987 | 0:15 |
{7,9,11} | 5 | .980 | 0:13 |
{7,9,11} | 5 | .986 | 0:13 |
Silence removed results 200 kernels
Kernel Sizes | Strides | Results | Time |
---|---|---|---|
{7,9,11} | 3 | .986 | 0:20 |
{7,9,11} | 3 | .986 | 0:20 |
{7,9,11} | 3 | .990 | 0:20 |
Silence removed results 1000 kernels
Kernel Sizes | Strides | Results | Time |
---|---|---|---|
{7,9,11} | 3 | .992 | 1:40 |
{7,9,11} | 3 | .992 | 1:40 |
{7,9,11} | 3 | .992 | 1:40 |
# Kernel-size sweep at stride 1: 10 runs of 100 kernels for each single size,
# plus the original {7, 9, 11} mix.
_sweep_s1 = dict(stride=1, num_kernels=100, seq_length=32000, show_progress=False)
times_ks2, scores_ks2 = timing_test(10, np.array((2,)), **_sweep_s1)
times_ks3, scores_ks3 = timing_test(10, np.array((3,)), **_sweep_s1)
times_ks5, scores_ks5 = timing_test(10, np.array((5,)), **_sweep_s1)
times_ks7, scores_ks7 = timing_test(10, np.array((7,)), **_sweep_s1)
times_ks9, scores_ks9 = timing_test(10, np.array((9,)), **_sweep_s1)
times_ks11, scores_ks11 = timing_test(10, np.array((11,)), **_sweep_s1)
times_ksorig, scores_ksorig = timing_test(10, np.array((7, 9, 11)), **_sweep_s1)
def mn(x):
    """Return the arithmetic mean of ``x`` rounded to three decimal places."""
    average = sum(x) / len(x)
    return round(average, 3)
# Collect the per-size results of the stride-1 sweep and average each one.
all_scores = [
    scores_ks2, scores_ks3, scores_ks5, scores_ks7,
    scores_ks9, scores_ks11, scores_ksorig,
]
all_times = [
    times_ks2, times_ks3, times_ks5, times_ks7,
    times_ks9, times_ks11, times_ksorig,
]
mean_times = [mn(run_times) for run_times in all_times]
mean_scores = [mn(run_scores) for run_scores in all_scores]
# Only the six single-size entries are plotted; the last entry is the {7, 9, 11} mix.
plt.plot([2, 3, 5, 7, 9, 11], mean_scores[:6])
[<matplotlib.lines.Line2D at 0x7fad81118310>]
# Mean run time against kernel size for the six single-size sweeps.
plt.plot([2,3,5,7,9,11], mean_times[:6])
[<matplotlib.lines.Line2D at 0x7fad819cd1d0>]
# Echo the mean accuracies (six single sizes, then the {7, 9, 11} mix).
mean_scores
[0.924, 0.956, 0.967, 0.968, 0.969, 0.962, 0.967]
Let's rerun the last experiment, this time with longer strides
# Same kernel-size sweep as before, now with stride 7.
_sweep_s7 = dict(stride=7, num_kernels=100, seq_length=32000, show_progress=False)
times_ks2, scores_ks2 = timing_test(10, np.array((2,)), **_sweep_s7)
times_ks3, scores_ks3 = timing_test(10, np.array((3,)), **_sweep_s7)
times_ks5, scores_ks5 = timing_test(10, np.array((5,)), **_sweep_s7)
times_ks7, scores_ks7 = timing_test(10, np.array((7,)), **_sweep_s7)
times_ks9, scores_ks9 = timing_test(10, np.array((9,)), **_sweep_s7)
times_ks11, scores_ks11 = timing_test(10, np.array((11,)), **_sweep_s7)
times_ksorig, scores_ksorig = timing_test(10, np.array((7, 9, 11)), **_sweep_s7)
all_scores_s7 = [
    scores_ks2, scores_ks3, scores_ks5, scores_ks7,
    scores_ks9, scores_ks11, scores_ksorig,
]
all_times_s7 = [
    times_ks2, times_ks3, times_ks5, times_ks7,
    times_ks9, times_ks11, times_ksorig,
]
mean_times_s7 = [mn(run_times) for run_times in all_times_s7]
mean_scores_s7 = [mn(run_scores) for run_scores in all_scores_s7]
mean_scores_s7
[0.905, 0.937, 0.949, 0.95, 0.958, 0.951, 0.957]
# Mean accuracy against kernel size at stride 7 (single-size sweeps only).
plt.plot([2,3,5,7,9,11], mean_scores_s7[:6])
[<matplotlib.lines.Line2D at 0x7fad80fe3c90>]
# Mean run time against kernel size at stride 7 (single-size sweeps only).
plt.plot([2,3,5,7,9,11], mean_times_s7[:6])
[<matplotlib.lines.Line2D at 0x7fad8a14bed0>]
def mn(x):
    """Mean of the values in ``x``, rounded to 3 decimals."""
    total = sum(x)
    return round(total / len(x), 3)
scores_ks, times_ks = [], []
# One run per kernel size, for sizes 7, 11, ..., 99 at stride 7.
for kernel_size in range(7, 100, 4):
    times, scores = timing_test(
        1,
        np.array((kernel_size,)),
        stride=7,
        num_kernels=100,
        seq_length=32000,
        show_progress=False,
    )
    print(f"Kernel Size {kernel_size}: Score: {mn(scores)} Time: {mn(times)}s")
    scores_ks.append(mn(scores))
    times_ks.append(mn(times))
Kernel Size 7: Score: 0.952 Time: 5.322s Kernel Size 11: Score: 0.962 Time: 6.52s Kernel Size 15: Score: 0.956 Time: 7.862s Kernel Size 19: Score: 0.965 Time: 8.23s Kernel Size 23: Score: 0.943 Time: 9.699s Kernel Size 27: Score: 0.953 Time: 11.522s Kernel Size 31: Score: 0.941 Time: 12.626s Kernel Size 35: Score: 0.948 Time: 12.208s Kernel Size 39: Score: 0.947 Time: 11.937s Kernel Size 43: Score: 0.931 Time: 12.709s Kernel Size 47: Score: 0.936 Time: 14.581s Kernel Size 51: Score: 0.926 Time: 15.842s Kernel Size 55: Score: 0.932 Time: 17.233s Kernel Size 59: Score: 0.926 Time: 18.059s Kernel Size 63: Score: 0.932 Time: 20.527s Kernel Size 67: Score: 0.925 Time: 20.773s Kernel Size 71: Score: 0.935 Time: 18.156s Kernel Size 75: Score: 0.915 Time: 19.595s Kernel Size 79: Score: 0.912 Time: 21.77s Kernel Size 83: Score: 0.926 Time: 23.429s Kernel Size 87: Score: 0.923 Time: 25.025s Kernel Size 91: Score: 0.923 Time: 26.44s Kernel Size 95: Score: 0.902 Time: 26.552s Kernel Size 99: Score: 0.928 Time: 29.478s
# Accuracy against kernel size for the scan above.
plt.xlabel("Kernel Size")
plt.ylabel("Accuracy")
plt.plot(np.arange(7,100,4), scores_ks);
# Run time against kernel size for the same scan.
plt.xlabel("Kernel Size")
plt.ylabel("Time for 100 kernels")
plt.plot(np.arange(7,100,4), times_ks);
# Kernel size 2 with 200 kernels at stride 7.
# NOTE(review): results are stored in *_ks3 although the kernel size passed is 2 --
# confirm which was intended.
times_ks3, scores_ks3 = timing_test(10, np.array((2,)), stride=7, num_kernels=200, seq_length=32000, show_progress=False)
mn(scores_ks3)
0.935
# Single runs with 2500 kernels at stride 1 for sizes 7, 9, 11 and the {7, 9, 11} mix.
times_7, scores_7 = timing_test(1, np.array((7,)), stride=1, num_kernels=2500, seq_length=32000, show_progress=False)
times_9, scores_9 = timing_test(1, np.array((9,)), stride=1, num_kernels=2500, seq_length=32000, show_progress=False)
times_11, scores_11 = timing_test(1, np.array((11,)), stride=1, num_kernels=2500, seq_length=32000, show_progress=False)
times_orig, scores_orig = timing_test(1, np.array((7,9,11,)), stride=1, num_kernels=2500, seq_length=32000, show_progress=False)
# Each list holds one run, so the "mean" is just that run's rounded accuracy.
mn(scores_7), mn(scores_9), mn(scores_11), mn(scores_orig)
(0.996, 0.996, 1.0, 0.995)
# Elapsed seconds of the four 2500-kernel runs.
times_7, times_9, times_11, times_orig
([600.959691286087], [723.5468919277191], [877.7471699714661], [831.6202943325043])
# Score 20 individual random kernels (num_kernels=1 per run).
times, scores = timing_test(20, np.array((7,9,11)), stride=5, num_kernels=1, seq_length=32000)
Finished Run 1 Score: 0.243 Time: 0.101 Finished Run 2 Score: 0.212 Time: 0.141 Finished Run 3 Score: 0.226 Time: 0.161 Finished Run 4 Score: 0.224 Time: 0.194 Finished Run 5 Score: 0.211 Time: 0.107 Finished Run 6 Score: 0.182 Time: 0.122 Finished Run 7 Score: 0.257 Time: 0.189 Finished Run 8 Score: 0.28 Time: 0.167 Finished Run 9 Score: 0.19 Time: 0.146 Finished Run 10 Score: 0.164 Time: 0.122 Finished Run 11 Score: 0.186 Time: 0.172 Finished Run 12 Score: 0.289 Time: 0.225 Finished Run 13 Score: 0.251 Time: 0.226 Finished Run 14 Score: 0.139 Time: 0.057 Finished Run 15 Score: 0.274 Time: 0.157 Finished Run 16 Score: 0.211 Time: 0.157 Finished Run 17 Score: 0.213 Time: 0.21 Finished Run 18 Score: 0.231 Time: 0.197 Finished Run 19 Score: 0.226 Time: 0.108 Finished Run 20 Score: 0.226 Time: 0.233
# Unrounded mean accuracy of the single-kernel runs.
sum(scores)/len(scores)
0.22178153446033813
def get_good_kernels(runs, candidate_lengths, stride, num_kernels, seq_length, thresh, subset_size=300, show_progress=True):
    """Randomly search for kernel sets whose validation accuracy exceeds ``thresh``.

    Each run draws a fresh kernel set, fits a ridge classifier on a random
    subset of the module-level training data and scores it on the full
    module-level validation data.

    Args:
        runs: number of kernel sets to try.
        candidate_lengths: array of kernel lengths passed to ``generate_kernels``.
        stride: stride passed to ``generate_kernels``.
        num_kernels: kernels per generated set.
        seq_length: input length passed to ``generate_kernels``.
        thresh: a set is kept only when its accuracy is strictly greater than this.
        subset_size: number of training examples sampled for fitting in each run.
        show_progress: print one line per finished run when true.

    Returns:
        ``(good_kernels, scores)``: the kept kernel sets and their accuracies,
        in matching order.
    """
    good_kernels, scores = [], []
    for i in range(runs):
        # Honour the caller's candidate_lengths (it used to be overwritten with
        # a hard-coded length of 7 here).
        kernels = generate_kernels(seq_length, num_kernels, candidate_lengths, stride)
        # Convert to a NumPy index array before indexing the NumPy data.
        idxs = torch.randperm(len(np_x_train))[:subset_size].numpy()
        np_x_train_subset = np_x_train[idxs]
        np_y_train_subset = np_y_train[idxs]
        x_train_tfm = apply_kernels(np_x_train_subset, kernels)
        x_valid_tfm = apply_kernels(np_x_valid, kernels)
        classifier = RidgeClassifierCV(alphas=np.logspace(-3, 3, 7), normalize=True)
        classifier.fit(x_train_tfm, np_y_train_subset)
        score = classifier.score(x_valid_tfm, np_y_valid)
        if score > thresh:
            good_kernels.append(kernels)
            scores.append(score)
        if show_progress:
            print("Finished Run", i+1, "Score:", round(score, 3))
    return good_kernels, scores
# Try 2000 single kernels of length 7 and keep those scoring above 0.275.
k, s = get_good_kernels(2000, np.array((7,)), stride=5, num_kernels=1,seq_length=32000, thresh=0.275, show_progress=False)
def merge_kernels(k):
    """Concatenate several kernel sets into a single kernel set.

    Args:
        k: sequence of kernel sets, each a 6-tuple
            ``(weights, lengths, biases, dilations, paddings, strides)``.
            A set may hold one or more kernels, and sets may use different
            weight widths.

    Returns:
        One ``(weights, lengths, biases, dilations, paddings, strides)`` tuple
        covering all kernels. ``weights`` is zero-padded on the right to the
        widest input; an empty ``k`` gives zero-row arrays with weight width 7.
    """
    # Normalise every component to an array so scalars and 1-D weights also work.
    kernel_sets = []
    for w, l, b, d, p, s in k:
        kernel_sets.append((
            np.atleast_2d(w), np.atleast_1d(l), np.atleast_1d(b),
            np.atleast_1d(d), np.atleast_1d(p), np.atleast_1d(s),
        ))
    num_kernels = sum(len(ks[1]) for ks in kernel_sets)
    # The width follows the data instead of being fixed at 7.
    width = max((ks[0].shape[1] for ks in kernel_sets), default=7)
    strides = np.zeros(num_kernels, dtype=np.int32)
    weights = np.zeros((num_kernels, width))
    lengths = np.zeros(num_kernels, dtype=np.int32)
    biases = np.zeros(num_kernels)
    dilations = np.zeros(num_kernels, dtype=np.int32)
    paddings = np.zeros(num_kernels, dtype=np.int32)
    start = 0
    for w, l, b, d, p, s in kernel_sets:
        stop = start + len(l)
        # Columns beyond this set's own width stay zero.
        weights[start:stop, :w.shape[1]] = w
        lengths[start:stop] = l
        biases[start:stop] = b
        dilations[start:stop] = d
        paddings[start:stop] = p
        strides[start:stop] = s
        start = stop
    return weights, lengths, biases, dilations, paddings, strides
# Number of kernel sets that passed the threshold.
len(k)
165
# Merge the selected single-kernel sets into one kernel set.
kernels = merge_kernels(k)
# Random kernels of similar count for comparison.
# NOTE(review): 168 kernels here vs. the 165 selected -- presumably meant to match; confirm.
times, scores = timing_test(10, np.array((7,)), stride=5, num_kernels=168, seq_length=32000, show_progress=True)
Finished Run 1 Score: 0.974 Time: 9.498 Finished Run 2 Score: 0.978 Time: 9.682 Finished Run 3 Score: 0.974 Time: 10.002 Finished Run 4 Score: 0.979 Time: 9.684 Finished Run 5 Score: 0.98 Time: 9.885 Finished Run 6 Score: 0.979 Time: 9.818 Finished Run 7 Score: 0.979 Time: 9.717 Finished Run 8 Score: 0.978 Time: 9.658 Finished Run 9 Score: 0.977 Time: 9.609 Finished Run 10 Score: 0.978 Time: 9.866
def score_kernels(k):
    """Return the validation accuracy of a ridge classifier trained on features from ``k``.

    Args:
        k: kernel set passed unchanged to ``apply_kernels``.

    Returns:
        Accuracy on the module-level validation split after fitting on the
        full module-level training split.
    """
    train_features = apply_kernels(np_x_train, k)
    valid_features = apply_kernels(np_x_valid, k)
    clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 7), normalize=True)
    clf.fit(train_features, np_y_train)
    return clf.score(valid_features, np_y_valid)
# Accuracy of the merged, pre-selected kernels on the full training data.
score_kernels(kernels)
0.9687906371911573
# Range and rounded mean of the scores of the kept kernel sets.
min(s), max(s), mn(s)
(0.4967490247074122, 0.5357607282184655, 0.52)
def get_dilation(input_length, filter_size):
    """Sample a (float) dilation as ``2 ** u``.

    ``u`` is drawn uniformly between 0 and
    ``log2((input_length - 1) // (filter_size - 1))``.
    """
    max_exponent = np.log2((input_length - 1) // (filter_size - 1))
    exponent = np.random.uniform(0, max_exponent)
    return 2 ** exponent
# Draw 100000 dilations for 32000-sample input and filter size 9.
dilations = [get_dilation(32000, 9) for _ in range(100000)]
# Keep the truncated values of the samples below 20.
low = [int(value) for value in dilations if value < 20]
len(low)
36299
# Histogram of all sampled dilations, then of the truncated values below 20.
plt.hist(dilations, bins= 50);
plt.hist(low)
(array([13271., 6201., 4053., 3075., 1307., 2189., 1945., 1577., 1403., 1278.]), array([ 1. , 2.8, 4.6, 6.4, 8.2, 10. , 11.8, 13.6, 15.4, 17.2, 19. ]), <a list of 10 Patch objects>)
# NOTE(review): njit is not imported in the visible import cell -- presumably
# `from numba import njit`; confirm it is in scope.
@njit
def generate_kernels_dilation(input_length, num_kernels, candidate_lengths=np.array((7, 9, 11)), stride=5, dilation=1):
    """Generate ``num_kernels`` random kernels that all share one dilation and stride.

    For each kernel: the length is chosen at random from ``candidate_lengths``,
    the weights are drawn from N(0, 1) and mean-centred, the bias is drawn
    uniformly from [-1, 1), and the padding is either 0 or
    ``((length - 1) * dilation) // 2`` depending on a random 0/1 draw.

    ``input_length`` is accepted but not used in the body.

    Returns:
        ``(weights, lengths, biases, dilations, paddings, strides)``.
    """
    # initialise kernel parameters
    strides = np.ones(num_kernels, dtype = np.int32) * stride
    weights = np.zeros((num_kernels, candidate_lengths.max())) # see note
    lengths = np.zeros(num_kernels, dtype = np.int32) # see note
    biases = np.zeros(num_kernels)
    dilations = np.zeros(num_kernels, dtype = np.int32)
    paddings = np.zeros(num_kernels, dtype = np.int32)
    # note: only the first *lengths[i]* values of *weights[i]* are used
    for i in range(num_kernels):
        length = np.random.choice(candidate_lengths)
        _weights = np.random.normal(0, 1, length)
        bias = np.random.uniform(-1, 1)
        # Pad on a coin flip; otherwise no padding.
        padding = ((length - 1) * dilation) // 2 if np.random.randint(2) == 1 else 0
        # Centre the weights so each kernel has zero mean.
        weights[i, :length] = _weights - _weights.mean()
        lengths[i], biases[i], dilations[i], paddings[i] = length, bias, dilation, padding
    return weights, lengths, biases, dilations, paddings, strides
def timing_test_dilation(runs, candidate_lengths, stride, num_kernels, seq_length, show_progress=True, dilation=1):
    """Time and score fixed-dilation random kernels over several runs.

    Kernels come from ``generate_kernels_dilation``; fitting and scoring use
    the module-level train/validation arrays.

    Args:
        runs: number of independent repetitions.
        candidate_lengths: array of kernel lengths to sample from.
        stride: stride shared by all kernels.
        num_kernels: kernels generated per run.
        seq_length: input length forwarded to the kernel generator.
        show_progress: print one line per finished run when true.
        dilation: dilation shared by all kernels.

    Returns:
        ``(times, scores)``: per-run elapsed seconds and validation accuracies.
    """
    times, scores = [], []
    for run_idx in range(1, runs + 1):
        kernels = generate_kernels_dilation(seq_length, num_kernels, candidate_lengths, stride, dilation)
        # Kernel generation is excluded from the measured time.
        tic = time.time()
        train_features = apply_kernels(np_x_train, kernels)
        valid_features = apply_kernels(np_x_valid, kernels)
        clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 7), normalize=True)
        clf.fit(train_features, np_y_train)
        accuracy = clf.score(valid_features, np_y_valid)
        elapsed = time.time() - tic
        scores.append(accuracy)
        times.append(elapsed)
        if show_progress:
            print("Finished Run", run_idx, "Score:", round(accuracy, 3), "Time:", round(elapsed, 3))
    return times, scores
def mn(x):
    """Average the entries of ``x`` and round the result to 3 decimal places."""
    mean_value = sum(x) / len(x)
    return round(mean_value, 3)
dilation_times, dilation_scores = [], []
# Powers of two from 1 to 512.
dilation_choices = [2 ** power for power in range(10)]
for dilation in dilation_choices:
    # Three runs of 100 length-9 kernels at stride 5 for each dilation.
    times, scores = timing_test_dilation(
        3,
        np.array((9,)),
        stride=5,
        num_kernels=100,
        seq_length=32000,
        dilation=dilation,
    )
    dilation_times.append(mn(times))
    dilation_scores.append(mn(scores))
Finished Run 1 Score: 0.882 Time: 6.582 Finished Run 2 Score: 0.866 Time: 6.703 Finished Run 3 Score: 0.876 Time: 6.641 Finished Run 1 Score: 0.884 Time: 6.745 Finished Run 2 Score: 0.879 Time: 6.639 Finished Run 3 Score: 0.891 Time: 6.553 Finished Run 1 Score: 0.935 Time: 6.618 Finished Run 2 Score: 0.927 Time: 6.789 Finished Run 3 Score: 0.934 Time: 6.582 Finished Run 1 Score: 0.966 Time: 6.587 Finished Run 2 Score: 0.965 Time: 6.609 Finished Run 3 Score: 0.973 Time: 6.606 Finished Run 1 Score: 0.958 Time: 6.683 Finished Run 2 Score: 0.966 Time: 6.503 Finished Run 3 Score: 0.96 Time: 6.71 Finished Run 1 Score: 0.944 Time: 6.516 Finished Run 2 Score: 0.935 Time: 6.606 Finished Run 3 Score: 0.935 Time: 6.689 Finished Run 1 Score: 0.896 Time: 6.714 Finished Run 2 Score: 0.895 Time: 6.669 Finished Run 3 Score: 0.921 Time: 6.726 Finished Run 1 Score: 0.856 Time: 6.559 Finished Run 2 Score: 0.84 Time: 6.491 Finished Run 3 Score: 0.839 Time: 6.619 Finished Run 1 Score: 0.688 Time: 6.618 Finished Run 2 Score: 0.7 Time: 6.814 Finished Run 3 Score: 0.73 Time: 6.608 Finished Run 1 Score: 0.559 Time: 6.434 Finished Run 2 Score: 0.572 Time: 6.62 Finished Run 3 Score: 0.551 Time: 6.822
# Mean accuracy against dilation (linear x-axis).
plt.plot(dilation_choices, dilation_scores)
[<matplotlib.lines.Line2D at 0x7f2efd8b0f90>]
# Same plot on a base-2 logarithmic x-axis, matching the power-of-two dilations.
plt.xscale('log',basex=2)
plt.plot(dilation_choices, dilation_scores)
[<matplotlib.lines.Line2D at 0x7f2f26fccd90>]
dilation_times_s1, dilation_scores_s1 = [], []
# Odd dilations 1, 3, ..., 99.
dilation_choices_s1 = list(range(1, 100, 2))
for dilation in dilation_choices_s1:
    # Three runs of 50 length-9 kernels at stride 1 for each dilation.
    times, scores = timing_test_dilation(
        3,
        np.array((9,)),
        stride=1,
        num_kernels=50,
        seq_length=32000,
        dilation=dilation,
    )
    dilation_times_s1.append(mn(times))
    dilation_scores_s1.append(mn(scores))
Finished Run 1 Score: 0.878 Time: 14.549 Finished Run 2 Score: 0.866 Time: 14.888 Finished Run 3 Score: 0.901 Time: 14.822 Finished Run 1 Score: 0.896 Time: 14.919 Finished Run 2 Score: 0.867 Time: 14.778 Finished Run 3 Score: 0.886 Time: 14.611 Finished Run 1 Score: 0.93 Time: 14.642 Finished Run 2 Score: 0.926 Time: 14.622 Finished Run 3 Score: 0.926 Time: 14.865 Finished Run 1 Score: 0.931 Time: 14.681 Finished Run 2 Score: 0.952 Time: 14.774 Finished Run 3 Score: 0.944 Time: 14.654 Finished Run 1 Score: 0.941 Time: 14.715 Finished Run 2 Score: 0.948 Time: 14.705 Finished Run 3 Score: 0.953 Time: 14.948 Finished Run 1 Score: 0.948 Time: 14.583 Finished Run 2 Score: 0.964 Time: 14.786 Finished Run 3 Score: 0.953 Time: 14.838 Finished Run 1 Score: 0.949 Time: 14.716 Finished Run 2 Score: 0.948 Time: 14.958 Finished Run 3 Score: 0.944 Time: 14.608 Finished Run 1 Score: 0.956 Time: 14.703 Finished Run 2 Score: 0.951 Time: 14.645 Finished Run 3 Score: 0.945 Time: 14.827 Finished Run 1 Score: 0.944 Time: 14.907 Finished Run 2 Score: 0.932 Time: 14.732 Finished Run 3 Score: 0.953 Time: 14.749 Finished Run 1 Score: 0.934 Time: 14.842 Finished Run 2 Score: 0.936 Time: 14.786 Finished Run 3 Score: 0.934 Time: 14.776 Finished Run 1 Score: 0.918 Time: 14.902 Finished Run 2 Score: 0.926 Time: 14.77 Finished Run 3 Score: 0.93 Time: 14.729 Finished Run 1 Score: 0.923 Time: 14.735 Finished Run 2 Score: 0.941 Time: 14.648 Finished Run 3 Score: 0.935 Time: 14.906 Finished Run 1 Score: 0.912 Time: 14.78 Finished Run 2 Score: 0.915 Time: 14.993 Finished Run 3 Score: 0.93 Time: 14.697 Finished Run 1 Score: 0.926 Time: 14.718 Finished Run 2 Score: 0.927 Time: 14.732 Finished Run 3 Score: 0.919 Time: 14.908 Finished Run 1 Score: 0.925 Time: 14.836 Finished Run 2 Score: 0.91 Time: 14.777 Finished Run 3 Score: 0.921 Time: 14.546 Finished Run 1 Score: 0.922 Time: 14.754 Finished Run 2 Score: 0.936 Time: 14.711 Finished Run 3 Score: 0.931 Time: 14.709 Finished Run 1 Score: 0.919 Time: 
14.655 Finished Run 2 Score: 0.914 Time: 14.71 Finished Run 3 Score: 0.922 Time: 14.666 Finished Run 1 Score: 0.926 Time: 14.834 Finished Run 2 Score: 0.923 Time: 14.819 Finished Run 3 Score: 0.922 Time: 14.6 Finished Run 1 Score: 0.91 Time: 14.784 Finished Run 2 Score: 0.915 Time: 14.683 Finished Run 3 Score: 0.91 Time: 14.979 Finished Run 1 Score: 0.914 Time: 14.826 Finished Run 2 Score: 0.904 Time: 14.8 Finished Run 3 Score: 0.906 Time: 14.59 Finished Run 1 Score: 0.891 Time: 14.707 Finished Run 2 Score: 0.914 Time: 14.521 Finished Run 3 Score: 0.893 Time: 14.799 Finished Run 1 Score: 0.912 Time: 14.816 Finished Run 2 Score: 0.901 Time: 14.74 Finished Run 3 Score: 0.886 Time: 14.697 Finished Run 1 Score: 0.899 Time: 14.834 Finished Run 2 Score: 0.886 Time: 14.563 Finished Run 3 Score: 0.88 Time: 14.822 Finished Run 1 Score: 0.908 Time: 14.667 Finished Run 2 Score: 0.889 Time: 14.644 Finished Run 3 Score: 0.904 Time: 14.715 Finished Run 1 Score: 0.917 Time: 14.94 Finished Run 2 Score: 0.91 Time: 14.903 Finished Run 3 Score: 0.895 Time: 14.803 Finished Run 1 Score: 0.893 Time: 14.607 Finished Run 2 Score: 0.909 Time: 14.907 Finished Run 3 Score: 0.897 Time: 14.747 Finished Run 1 Score: 0.893 Time: 14.604 Finished Run 2 Score: 0.897 Time: 14.374 Finished Run 3 Score: 0.892 Time: 14.582 Finished Run 1 Score: 0.875 Time: 14.716 Finished Run 2 Score: 0.892 Time: 14.94 Finished Run 3 Score: 0.892 Time: 14.722 Finished Run 1 Score: 0.88 Time: 14.622 Finished Run 2 Score: 0.879 Time: 14.721 Finished Run 3 Score: 0.889 Time: 14.776 Finished Run 1 Score: 0.87 Time: 14.488 Finished Run 2 Score: 0.858 Time: 14.663 Finished Run 3 Score: 0.867 Time: 14.558 Finished Run 1 Score: 0.873 Time: 14.767 Finished Run 2 Score: 0.867 Time: 14.594 Finished Run 3 Score: 0.887 Time: 14.619 Finished Run 1 Score: 0.883 Time: 14.65 Finished Run 2 Score: 0.896 Time: 14.624 Finished Run 3 Score: 0.896 Time: 14.561 Finished Run 1 Score: 0.878 Time: 14.789 Finished Run 2 Score: 0.863 Time: 14.719 
Finished Run 3 Score: 0.888 Time: 14.594 Finished Run 1 Score: 0.893 Time: 14.561 Finished Run 2 Score: 0.865 Time: 14.638 Finished Run 3 Score: 0.882 Time: 14.724 Finished Run 1 Score: 0.857 Time: 14.438 Finished Run 2 Score: 0.847 Time: 14.775 Finished Run 3 Score: 0.843 Time: 14.786 Finished Run 1 Score: 0.876 Time: 14.681 Finished Run 2 Score: 0.854 Time: 14.593 Finished Run 3 Score: 0.858 Time: 14.71 Finished Run 1 Score: 0.862 Time: 14.673 Finished Run 2 Score: 0.875 Time: 14.582 Finished Run 3 Score: 0.861 Time: 14.604 Finished Run 1 Score: 0.858 Time: 14.714 Finished Run 2 Score: 0.858 Time: 14.805 Finished Run 3 Score: 0.871 Time: 14.836 Finished Run 1 Score: 0.848 Time: 14.815 Finished Run 2 Score: 0.844 Time: 14.783 Finished Run 3 Score: 0.856 Time: 14.726 Finished Run 1 Score: 0.836 Time: 14.944 Finished Run 2 Score: 0.857 Time: 14.762 Finished Run 3 Score: 0.843 Time: 14.744 Finished Run 1 Score: 0.826 Time: 14.824 Finished Run 2 Score: 0.853 Time: 14.541 Finished Run 3 Score: 0.831 Time: 14.737 Finished Run 1 Score: 0.831 Time: 14.801 Finished Run 2 Score: 0.827 Time: 14.658 Finished Run 3 Score: 0.854 Time: 14.864 Finished Run 1 Score: 0.837 Time: 14.826 Finished Run 2 Score: 0.843 Time: 14.59 Finished Run 3 Score: 0.841 Time: 14.695 Finished Run 1 Score: 0.862 Time: 14.857 Finished Run 2 Score: 0.841 Time: 14.672 Finished Run 3 Score: 0.848 Time: 14.481 Finished Run 1 Score: 0.813 Time: 14.62 Finished Run 2 Score: 0.836 Time: 14.812 Finished Run 3 Score: 0.814 Time: 14.819 Finished Run 1 Score: 0.827 Time: 14.675 Finished Run 2 Score: 0.835 Time: 14.727 Finished Run 3 Score: 0.834 Time: 14.889 Finished Run 1 Score: 0.831 Time: 14.705 Finished Run 2 Score: 0.852 Time: 14.881 Finished Run 3 Score: 0.86 Time: 14.648 Finished Run 1 Score: 0.832 Time: 14.854 Finished Run 2 Score: 0.865 Time: 14.65 Finished Run 3 Score: 0.849 Time: 14.488 Finished Run 1 Score: 0.862 Time: 15.05 Finished Run 2 Score: 0.843 Time: 14.832 Finished Run 3 Score: 0.865 Time: 
14.816 Finished Run 1 Score: 0.839 Time: 14.581 Finished Run 2 Score: 0.871 Time: 14.629 Finished Run 3 Score: 0.834 Time: 14.494
# Mean accuracy against dilation for the stride-1 sweep.
plt.plot(dilation_choices_s1, dilation_scores_s1)
[<matplotlib.lines.Line2D at 0x7f2efdd71310>]
from collections import defaultdict
# filter size -> list of mean accuracies, one per dilation in dilation_choices_filt order
dilation_scores_dict = defaultdict(list)
dilation_times_filt = []
dilation_choices_filt = [3,5,7,9,11]
filter_choices = [5,7,9,11,13]
for dilation in dilation_choices_filt:
    for filter_size in filter_choices:
        times, scores = timing_test_dilation(3, np.array((filter_size,)), stride=5, num_kernels=100, seq_length=32000, dilation=dilation)
        # Timings go into this grid's own list, not the earlier stride-1 list.
        dilation_times_filt.append(mn(times))
        dilation_scores_dict[filter_size].append((mn(scores)))
Finished Run 1 Score: 0.817 Time: 4.494 Finished Run 2 Score: 0.818 Time: 4.557 Finished Run 3 Score: 0.835 Time: 4.681 Finished Run 1 Score: 0.882 Time: 5.53 Finished Run 2 Score: 0.863 Time: 5.522 Finished Run 3 Score: 0.873 Time: 5.477 Finished Run 1 Score: 0.909 Time: 6.643 Finished Run 2 Score: 0.9 Time: 6.353 Finished Run 3 Score: 0.896 Time: 6.341 Finished Run 1 Score: 0.913 Time: 7.454 Finished Run 2 Score: 0.931 Time: 7.347 Finished Run 3 Score: 0.926 Time: 7.466 Finished Run 1 Score: 0.925 Time: 8.48 Finished Run 2 Score: 0.943 Time: 8.527 Finished Run 3 Score: 0.932 Time: 8.416 Finished Run 1 Score: 0.843 Time: 4.627 Finished Run 2 Score: 0.866 Time: 4.468 Finished Run 3 Score: 0.848 Time: 4.485 Finished Run 1 Score: 0.931 Time: 5.467 Finished Run 2 Score: 0.919 Time: 5.331 Finished Run 3 Score: 0.926 Time: 5.596 Finished Run 1 Score: 0.956 Time: 6.512 Finished Run 2 Score: 0.956 Time: 6.47 Finished Run 3 Score: 0.943 Time: 6.544 Finished Run 1 Score: 0.954 Time: 7.438 Finished Run 2 Score: 0.962 Time: 7.26 Finished Run 3 Score: 0.957 Time: 7.515 Finished Run 1 Score: 0.966 Time: 8.479 Finished Run 2 Score: 0.973 Time: 8.575 Finished Run 3 Score: 0.969 Time: 8.443 Finished Run 1 Score: 0.896 Time: 4.541 Finished Run 2 Score: 0.896 Time: 4.728 Finished Run 3 Score: 0.893 Time: 4.649 Finished Run 1 Score: 0.951 Time: 5.598 Finished Run 2 Score: 0.945 Time: 5.607 Finished Run 3 Score: 0.945 Time: 5.402 Finished Run 1 Score: 0.973 Time: 6.426 Finished Run 2 Score: 0.954 Time: 6.408 Finished Run 3 Score: 0.96 Time: 6.38 Finished Run 1 Score: 0.965 Time: 7.573 Finished Run 2 Score: 0.964 Time: 7.515 Finished Run 3 Score: 0.979 Time: 7.618 Finished Run 1 Score: 0.974 Time: 8.507 Finished Run 2 Score: 0.973 Time: 8.445 Finished Run 3 Score: 0.967 Time: 8.396 Finished Run 1 Score: 0.925 Time: 4.644 Finished Run 2 Score: 0.931 Time: 4.628 Finished Run 3 Score: 0.915 Time: 4.509 Finished Run 1 Score: 0.962 Time: 5.551 Finished Run 2 Score: 0.969 Time: 5.549 
Finished Run 3 Score: 0.967 Time: 5.536 Finished Run 1 Score: 0.967 Time: 6.526 Finished Run 2 Score: 0.98 Time: 6.507 Finished Run 3 Score: 0.973 Time: 6.601 Finished Run 1 Score: 0.966 Time: 7.463 Finished Run 2 Score: 0.965 Time: 7.491 Finished Run 3 Score: 0.975 Time: 7.447 Finished Run 1 Score: 0.973 Time: 8.469 Finished Run 2 Score: 0.977 Time: 8.521 Finished Run 3 Score: 0.969 Time: 8.459 Finished Run 1 Score: 0.949 Time: 4.567 Finished Run 2 Score: 0.948 Time: 4.631 Finished Run 3 Score: 0.944 Time: 4.5 Finished Run 1 Score: 0.967 Time: 5.534 Finished Run 2 Score: 0.965 Time: 5.42 Finished Run 3 Score: 0.962 Time: 5.598 Finished Run 1 Score: 0.974 Time: 6.47 Finished Run 2 Score: 0.964 Time: 6.559 Finished Run 3 Score: 0.975 Time: 6.488 Finished Run 1 Score: 0.966 Time: 7.595 Finished Run 2 Score: 0.974 Time: 7.46 Finished Run 3 Score: 0.966 Time: 7.594 Finished Run 1 Score: 0.97 Time: 8.581 Finished Run 2 Score: 0.965 Time: 8.631 Finished Run 3 Score: 0.961 Time: 8.423
# Echo the accuracies grouped by filter size.
dilation_scores_dict
defaultdict(list, {5: [0.823, 0.852, 0.895, 0.924, 0.947], 7: [0.873, 0.925, 0.947, 0.966, 0.965], 9: [0.902, 0.951, 0.962, 0.974, 0.971], 11: [0.923, 0.958, 0.969, 0.969, 0.969], 13: [0.933, 0.969, 0.971, 0.973, 0.965]})
# One accuracy-vs-dilation curve per filter size; labels are taken positionally
# from filter_choices.
for idx, accuracies in enumerate(dilation_scores_dict.values()):
    curve_label = filter_choices[idx]
    plt.plot(dilation_choices_filt, accuracies, label=curve_label)
plt.legend(title="Filter Size")
plt.title("")
plt.xlabel("dilation")
plt.ylabel("100 Kernel Accuracy")
# Single run with 500 length-9 kernels at dilation 8 and stride 5.
times, scores = timing_test_dilation(1, np.array((9,)), stride=5, num_kernels=500, seq_length=32000, dilation=8)
Finished Run 1 Score: 0.988 Time: 32.032
from scipy.stats import skewnorm
# Draw 1000 samples from a skew-normal distribution with shape parameter a=6
# and look at their histogram.
a=6
data= skewnorm.rvs(a, size=1000)
plt.hist(data)
(array([ 31., 192., 252., 178., 135., 110., 48., 34., 14., 6.]), array([-0.40513973, -0.06437618, 0.27638737, 0.61715092, 0.95791447, 1.29867802, 1.63944157, 1.98020512, 2.32096867, 2.66173222, 3.00249577]), <a list of 10 Patch objects>)
from scipy.stats import skewnorm
#dilation = 2 ** np.random.uniform(0, np.log2((input_length - 1) // (length - 1)))
def dilation_strategy(num_kernels, input_length, max_length):
    """Sample ``num_kernels`` integer dilations from a skewed log10-scale distribution.

    Exponents are drawn from ``skewnorm(10)`` and kept only when they fall in
    ``[0, log10((input_length - 1) // (max_length - 1)))``. Each kept exponent
    ``e`` becomes the dilation ``int(10 ** e)``. Rejected draws are replaced by
    drawing again, so exactly ``num_kernels`` values are returned.

    Args:
        num_kernels: number of dilations to produce.
        input_length: length of the input sequence.
        max_length: largest kernel length that will be used.

    Returns:
        ``np.int32`` array of length ``num_kernels`` (empty if ``num_kernels <= 0``).

    Raises:
        ValueError: if ``max_length < 2`` or a kernel of ``max_length`` does
            not fit into ``input_length``.
    """
    if max_length < 2:
        raise ValueError("max_length must be at least 2")
    span_ratio = (input_length - 1) // (max_length - 1)
    if span_ratio < 1:
        raise ValueError("input_length is too short for a kernel of max_length")
    if num_kernels <= 0:
        return np.zeros(0, dtype=np.int32)
    max_exponent = np.log10(span_ratio)
    if max_exponent <= 0:
        # Only dilation 1 fits; rejection sampling would never accept a draw.
        return np.ones(num_kernels, dtype=np.int32)
    accepted, n_accepted = [], 0
    # Keep drawing batches until enough exponents survive the range filter.
    while n_accepted < num_kernels:
        draws = np.atleast_1d(skewnorm.rvs(10, size=num_kernels))
        kept = draws[(draws >= 0) & (draws < max_exponent)]
        accepted.append(kept)
        n_accepted += len(kept)
    exponents = np.concatenate(accepted)[:num_kernels]
    # astype truncates toward zero, like int() on each positive value.
    return (10 ** exponents).astype(np.int32)
# Request 10000 dilations for 32000-sample input and a maximum kernel length of 9.
dilations = dilation_strategy(10000,32000, 9)
print(dilations)
[ 2 2 3 ... 4 1 96]
from collections import Counter
# Count how often each dilation value was sampled, then look up dilation 3.
c = Counter(dilations)
c[3]
873
# The 50 most frequently sampled dilation values, most frequent first.
sorted(c, key=lambda x: c[x], reverse=True)[:50]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 11, 13, 14, 15, 18, 17, 16, 20, 19, 21, 24, 22, 28, 25, 26, 23, 37, 30, 32, 33, 34, 27, 31, 35, 29, 44, 39, 49, 42, 54, 47, 36, 41, 40, 53, 43, 70, 51, 52]
# Largest sampled dilation.
dilations.max()
3995
# Spot-check a single entry.
dilations[1]
1
# Histogram of the sampled dilations on a base-10 logarithmic x-axis.
plt.xscale('log',basex=10)
plt.hist(dilations,bins=50);
# NOTE(review): njit is not imported in the visible import cell -- presumably
# `from numba import njit`; confirm it is in scope.
@njit
def generate_kernels_dilation_strategy(input_length, num_kernels, candidate_lengths=np.array((7, 9, 11)), stride=5, dilations=dilations):
    """Generate ``num_kernels`` random kernels that use pre-sampled per-kernel dilations.

    For each kernel: the length is chosen at random from ``candidate_lengths``,
    the weights are drawn from N(0, 1) and mean-centred, the bias is drawn
    uniformly from [-1, 1), and the padding is either 0 or
    ``((length - 1) * dilations[i]) // 2`` depending on a random 0/1 draw.

    ``input_length`` is accepted but not used in the body. The default for
    ``dilations`` is whatever the module-level ``dilations`` held when this
    function was defined.

    Returns:
        ``(weights, lengths, biases, dilations, paddings, strides)``, where
        ``dilations`` is the array that was passed in.

    Raises:
        ValueError: if ``dilations`` has fewer than ``num_kernels`` entries.
    """
    # Numba does not bounds-check array indexing by default, so a too-short
    # dilation array must be rejected explicitly instead of being read past its end.
    if len(dilations) < num_kernels:
        raise ValueError("dilations must contain at least num_kernels entries")
    # initialise kernel parameters
    strides = np.ones(num_kernels, dtype = np.int32) * stride
    weights = np.zeros((num_kernels, candidate_lengths.max())) # see note
    lengths = np.zeros(num_kernels, dtype = np.int32) # see note
    biases = np.zeros(num_kernels)
    paddings = np.zeros(num_kernels, dtype = np.int32)
    # note: only the first *lengths[i]* values of *weights[i]* are used
    for i in range(num_kernels):
        length = np.random.choice(candidate_lengths)
        _weights = np.random.normal(0, 1, length)
        bias = np.random.uniform(-1, 1)
        # Pad on a coin flip; otherwise no padding.
        padding = ((length - 1) * dilations[i]) // 2 if np.random.randint(2) == 1 else 0
        # Centre the weights so each kernel has zero mean.
        weights[i, :length] = _weights - _weights.mean()
        lengths[i], biases[i], paddings[i] = length, bias, padding
    return weights, lengths, biases, dilations, paddings, strides
# This definition replaces the ``apply_kernels`` imported from ``rocket`` above and
# forwards a per-kernel stride to ``apply_kernel``.
# NOTE(review): njit/prange are not imported in the visible import cell -- presumably
# from numba; confirm they are in scope.
@njit(parallel = True, fastmath = True)
def apply_kernels(X, kernels):
    """Apply every kernel to every example, producing two features per kernel.

    Args:
        X: indexable collection of examples; ``X[i]`` is passed to ``apply_kernel``.
        kernels: tuple ``(weights, lengths, biases, dilations, paddings, strides)``.

    Returns:
        Array of shape ``(len(X), 2 * num_kernels)``; columns ``2*j`` and
        ``2*j + 1`` hold the two values returned by ``apply_kernel`` for kernel ``j``.
    """
    weights, lengths, biases, dilations, paddings, strides = kernels
    num_examples = len(X)
    num_kernels = len(weights)
    # initialise output
    _X = np.zeros((num_examples, num_kernels * 2)) # 2 features per kernel
    # Outer loop over examples uses prange; each iteration writes only its own row.
    for i in prange(num_examples):
        for j in range(num_kernels):
            # Only the first lengths[j] weights of row j are meaningful.
            _X[i, (j * 2):((j * 2) + 2)] = \
            apply_kernel(X[i], weights[j][:lengths[j]], lengths[j], biases[j], dilations[j], paddings[j], strides[j])
    return _X
def timing_test_dilation_strategy(runs, candidate_lengths, stride, num_kernels, seq_length, show_progress=True):
    """Time and score random kernels whose dilations come from a sampling strategy.

    Each run samples fresh per-kernel dilations, generates kernels with
    ``generate_kernels_dilation_strategy``, then fits and scores a ridge
    classifier on the module-level train/validation arrays.

    Args:
        runs: number of independent repetitions.
        candidate_lengths: array of kernel lengths to sample from.
        stride: stride shared by all kernels.
        num_kernels: kernels generated per run.
        seq_length: input length forwarded to the kernel generator.
        show_progress: print one line per finished run when true.

    Returns:
        ``(times, scores)``: per-run elapsed seconds and validation accuracies.
    """
    times, scores = [], []
    for run_idx in range(1, runs + 1):
        # NOTE(review): simple_dilation_strategy is not defined in the visible part
        # of this file -- presumably from another cell; confirm it is in scope.
        per_kernel_dilations = simple_dilation_strategy(num_kernels)
        kernels = generate_kernels_dilation_strategy(
            seq_length, num_kernels, candidate_lengths, stride, per_kernel_dilations
        )
        # Dilation sampling and kernel generation are excluded from the measured time.
        tic = time.time()
        train_features = apply_kernels(np_x_train, kernels)
        valid_features = apply_kernels(np_x_valid, kernels)
        clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 7), normalize=True)
        clf.fit(train_features, np_y_train)
        accuracy = clf.score(valid_features, np_y_valid)
        elapsed = time.time() - tic
        scores.append(accuracy)
        times.append(elapsed)
        if show_progress:
            print("Finished Run", run_idx, "Score:", round(accuracy, 3), "Time:", round(elapsed, 3))
    return times, scores
# Single run with 1000 kernels of length 7/9/11 at stride 5, using sampled dilations.
timing_test_dilation_strategy(1, np.array((7,9,11,)), stride=5, num_kernels=1000, seq_length=32000)
Finished Run 1 Score: 0.965 Time: 68.379
([68.37943172454834], [0.9648894668400521])