This IPython Notebook demonstrates the offline speech enhancement algorithm presented in:
DOI: 10.1109/TASLP.2017.2656805
We follow the same structure as the offline GCC-NMF blind speech separation demo presented previously. Speech enhancement is performed directly on the stereo mixture signal, using no additional data.
This demo enhances the speech from the data/dev_Sq1_Co_A_mix.wav mixture, taken from the SiSEC 2016 Two-channel mixtures of speech and real-world background noise "dev" dataset, and saves results to the data directory.
from gccNMF.gccNMFFunctions import *
from gccNMF.gccNMFPlotting import *
from IPython import display
%matplotlib inline
# Preprocessing params
windowSize = 1024  # STFT analysis window length in samples
fftSize = windowSize  # FFT length equals the window length (no zero-padding)
hopSize = 128  # STFT hop in samples (87.5% overlap at windowSize = 1024)
windowFunction = hanning  # analysis window (numpy hanning, via the star imports above)
# TDOA params
numTDOAs = 128  # number of hypothesised time-difference-of-arrival candidates
targetTDOAWindowSizePercent = 0.05  # mask half-band around the target TDOA, as a fraction of the full TDOA range
# NMF params
dictionarySize = 128  # number of NMF dictionary atoms
numIterations = 100  # KL-NMF update iterations
sparsityAlpha = 0  # sparsity penalty weight (0 disables the sparsity term)
# Input params
mixtureFileNamePrefix = '../data/dev_Sq1_Co_A'  # SiSEC 2016 "dev" mixture, path prefix without extension
microphoneSeparationInMetres = 0.086
numSources = 1  # speech enhancement: a single target source
# Grid of candidate TDOAs for this microphone spacing, in seconds.
hypothesisTDOAs = getTDOAsInSeconds(microphoneSeparationInMetres, numTDOAs)
# Absolute TDOA acceptance window derived from the percentage above.
targetTDOAWindowSize = (hypothesisTDOAs[-1] - hypothesisTDOAs[0]) * targetTDOAWindowSizePercent
# Load the stereo mixture, describe it, plot the waveforms, and embed a player.
mixtureFileName = getMixtureFileName(mixtureFileNamePrefix)
stereoSamples, sampleRate = loadMixtureSignal(mixtureFileName)  # stereoSamples: (numChannels, numSamples)
numChannels, numSamples = stereoSamples.shape
durationInSeconds = numSamples / float(sampleRate)  # float() keeps true division under Python 2
describeMixtureSignal(stereoSamples, sampleRate)
figure(figsize=(14, 6))
plotMixtureSignal(stereoSamples, sampleRate)
# Inline audio player for the input mixture.
display.display( display.Audio(mixtureFileName) )
Input mixture signal: sampleRate: 16000 samples/sec numChannels: 2 numSamples: 160000 dtype: float32 duration: 10.00 seconds
# Multichannel STFT of the stereo mixture.
# Resulting shape: (numChannels, numFrequencies, numTime), dtype complex64
# (see the printed description in the cell output below).
complexMixtureSpectrogram = computeComplexMixtureSpectrogram( stereoSamples, windowSize,
hopSize, windowFunction )
numChannels, numFrequencies, numTime = complexMixtureSpectrogram.shape
frequenciesInHz = getFrequenciesInHz(sampleRate, numFrequencies)
frequenciesInkHz = frequenciesInHz / 1000.0  # kHz, for plot axis labels
describeMixtureSpectrograms(windowSize, hopSize, windowFunction, complexMixtureSpectrogram)
figure(figsize=(12, 8))
plotMixtureSpectrograms(complexMixtureSpectrogram, frequenciesInkHz, durationInSeconds)
STFT: windowSize: 1024 hopSize: 128 windowFunction: <function hanning at 0x10d683d90> complexMixtureSpectrogram.shape = (numChannels, numFreq, numWindows): (2, 513, 1243) complexMixtureSpectrogram.dtype = complex64
# Complex spectral coherence (PHAT weighting): the cross-spectrum of the two
# channels, normalised per bin by each channel's magnitude so only the
# inter-channel phase remains.
crossSpectrum = complexMixtureSpectrogram[0] * complexMixtureSpectrogram[1].conj()
spectralCoherenceV = crossSpectrum / abs(complexMixtureSpectrogram[0]) / abs(complexMixtureSpectrogram[1])

# GCC-PHAT angular spectrogram over the hypothesised TDOA grid.
angularSpectrogram = getAngularSpectrogram(spectralCoherenceV, frequenciesInHz,
                                           microphoneSeparationInMetres, numTDOAs)
# Average the angular spectrogram over time and pick the strongest peak(s)
# as the estimated target TDOA index(es).
meanAngularSpectrum = mean(angularSpectrogram, axis=-1)
targetTDOAIndexes = estimateTargetTDOAIndexesFromAngularSpectrum( meanAngularSpectrum,
microphoneSeparationInMetres,
numTDOAs, numSources)
# Single target source (numSources = 1): take the first estimated index.
targetTDOAIndex = targetTDOAIndexes[0]
targetTDOA = hypothesisTDOAs[targetTDOAIndex]
figure(figsize=(14, 6))
plotGCCPHATLocalization( spectralCoherenceV, angularSpectrogram, meanAngularSpectrum,
targetTDOAIndexes, microphoneSeparationInMetres, numTDOAs,
durationInSeconds )
# KL-NMF of the magnitude spectrograms: both channels are concatenated along
# the time axis, so V has shape (numFreq, numChannels * numTime).
V = concatenate( abs(complexMixtureSpectrogram), axis=-1 )
W, H = performKLNMF(V, dictionarySize, numIterations, sparsityAlpha)
numChannels = stereoSamples.shape[0]
# Undo the concatenation on the coefficients: stereoH is (numChannels, numAtoms, numTime).
stereoH = array( hsplit(H, numChannels) )
describeNMFDecomposition(V, W, H)
figure(figsize=(12, 12))
plotNMFDecomposition(V, W, H, frequenciesInkHz, durationInSeconds, numAtomsToPlot=15)
Input V: V.shape = (numFreq, numWindows): (513, 2486) V.dtype = float32 Dictionary W: W.shape = (numFreq, numAtoms): (513, 128) W.dtype = float32 Coefficients H: H.shape = (numAtoms, numWindows): (128, 2486) H.dtype = float32
# Frequency x TDOA phase-rotation matrix: entry (f, tau) = exp(-2j*pi*f*tau),
# used to steer the spectral coherence to each hypothesised TDOA.
expJOmegaTau = exp( outer(frequenciesInHz, -(2j * pi) * hypothesisTDOAs) )

# For each STFT frame, project the phase-aligned coherence onto the NMF
# dictionary. Each frame contributes a (numTDOAs, numAtoms) score matrix;
# stacking and transposing gives gccNMF of shape (numAtoms, numTDOAs, numTime).
gccNMF = array( [dot( (spectralCoherenceV[:, frameIndex, newaxis] * expJOmegaTau).real.T, W )
                 for frameIndex in range(numTime)] ).T

# TDOA index at which each atom's GCC peaks, per frame: (numAtoms, numTime).
argMaxGCCNMF = argmax(gccNMF, axis=1)
# Fancy indexing instead of np.take: `np` is never imported in this notebook
# (only star imports above), so np.take would raise NameError; for a 1-D
# source array the two are equivalent.
gccNMFMaxTDOA = hypothesisTDOAs[argMaxGCCNMF]

# Keep only atom/frame coefficients whose peak TDOA lies inside the window
# around the estimated target TDOA.
distanceToTargetTDOA = abs(gccNMFMaxTDOA - targetTDOA)
targetCoefficientMask = distanceToTargetTDOA < targetTDOAWindowSize
targetCoefficientMasks = array([targetCoefficientMask])
# Visualise, per atom and frame: the peak TDOA, its distance to the target
# TDOA, and the resulting binary coefficient mask.
figure(figsize=(16, 6))
subplot(131)
imshow( gccNMFMaxTDOA, cmap=cm.jet, extent=[0, durationInSeconds, 0, dictionarySize-1])
colorbar()
ylabel('Atom Index')
title('GCC-NMF: max TDOA')
xlabel('Time (s)')
ax = subplot(132)
imshow( distanceToTargetTDOA, cmap=cm.jet, extent=[0, durationInSeconds, 0, dictionarySize-1])
colorbar()
title('GCC-NMF Target TDOA distance')
xlabel('Time (s)')
ax.set_yticklabels([])  # middle panel shares the atom-index axis with the first
ax = subplot(133)
imshow(targetCoefficientMask, cmap=cm.binary, extent=[0, durationInSeconds, 0, dictionarySize-1])
colorbar()
title('GCC-NMF Coefficient Mask')
xlabel('Time (s)')
ax.set_yticklabels([])
show()
figure(figsize=(12, 12))
plotCoefficientMasks(targetCoefficientMasks, stereoH, durationInSeconds)
# Reconstruct the target spectrogram estimate by applying the coefficient
# mask to each channel's NMF decomposition.
targetSpectrogramEstimates = getTargetSpectrogramEstimates( targetCoefficientMasks,
complexMixtureSpectrogram, W,
stereoH )
figure(figsize=(16, 8))
plotTargetSpectrogramEstimates(targetSpectrogramEstimates, durationInSeconds, frequenciesInkHz)
# Inverse STFT back to the time domain, then save the enhanced signal to disk
# next to the input mixture.
targetSignalEstimates = getTargetSignalEstimates( targetSpectrogramEstimates, windowSize,
hopSize, windowFunction )
saveTargetSignalEstimates(targetSignalEstimates, sampleRate, mixtureFileNamePrefix)
figure(figsize=(14, 2))
plotTargetSignalEstimate(stereoSamples, sampleRate, 'Input Mixture')
figure(figsize=(14, 2))
targetFileName = getSourceEstimateFileName(mixtureFileNamePrefix, 0)
plotTargetSignalEstimate( targetSignalEstimates[0], sampleRate, 'Target' )
# Inline audio players: input mixture vs. enhanced target estimate.
print('Input Mixture')
display.display( display.Audio(mixtureFileName) )
print('Target Estimate')
display.display(display.Audio(targetFileName))
Input Mixture
Target Estimate
The trade-off between noise suppression and target fidelity may be controlled via the targetTDOAWindowSizePercent variable...