using PyCall
matplotlib = pyimport("matplotlib")
# Default figure size for every plot below (matplotlib interprets figsize in inches).
PyDict(matplotlib["rcParams"])["figure.figsize"] = (12, 5)
using PyPlot

# https://gist.github.com/jfsantos/a39ed69a7894876f1e04#file-audiodisplay-jl
# Thanks, @jfsantos
include("AudioDisplay.jl")

using WAV
using DSP
using MelGeneralizedCepstrums # to estimate spectral envelope parameters
using SynthesisFilters

# plotting utilities

# Plot the waveform `x` in blue against its 1-based sample index.
#   label   -- legend entry for the curve
#   x_label -- caption of the horizontal axis
# Returns whatever `legend()` returns.
function wavplot(x; label="a waveform", x_label="sample")
    n = length(x)
    plot(1:n, x, "b", label=label)
    xlim(1, n)
    xlabel(x_label)
    return legend()
end

# Overlay a synthesized waveform `y` (red line with `+` markers, drawn first) and
# the original waveform `x` (drawn on top, default colour).
#   label   -- legend entry for `y`; `x` is always labelled "original speech signal"
#   x_label -- caption of the horizontal axis
# The horizontal axis is limited to the length of `x`, so any extra samples of `y`
# are not shown. Returns whatever `legend()` returns.
function wavcompare(x, y; label="synthesized waveform", x_label="sample")
    plot(1:length(y), y, "r-+", label=label)
    plot(1:length(x), x, label="original speech signal")
    xlim(1, length(x))
    xlabel(x_label)
    return legend()
end

# Load the test utterance from the `data` directory next to this file.
# NOTE(review): format="native" presumably keeps the samples in the file's own
# sample type (16-bit integers are assumed by the Int16 playback below) -- confirm.
x, fs = wavread(joinpath(dirname(@__FILE__), "data", "test16k.wav"), format="native")
x = convert(Vector{Float64}, vec(x))
fs = convert(Int, fs)

wavplot(x)

# The samples are converted back to Int16 for playback.
inline_audioplayer(map(Int16, x), fs)

# Note about excitation
# fs: 16000
# frame period: 5.0 ms
# F0 analysis: estimated by WORLD.dio and WORLD.stonemask
# Excitation signal: periodic pulse for voiced segments and gaussian random values
# for un-voiced segments
# Alternative pulse/noise excitation (kept for reference):
# base_excitation = vec(readdlm(joinpath(dirname(@__FILE__), "data", "test16k_excitation.txt")))

# Mixed excitation generated using WORLD.jl
base_excitation = vec(readdlm(joinpath(dirname(@__FILE__), "data", "test16k_world_excitation.txt")))

wavplot(base_excitation, label="mixed excitation")

# Normalize by the peak *magnitude* for playback. Dividing by `maximum` alone only
# bounds the positive side, so a signal whose largest excursion is negative would
# leave [-1, 1]; an all-zero signal is played unchanged instead of becoming NaN.
excitation_peak = maximum(abs, base_excitation)
inline_audioplayer(excitation_peak > 0 ? base_excitation ./ excitation_peak : base_excitation, fs)

framelen = 512
hopsize = 80 # 5.0 ms for fs 16000
noverlap = framelen - hopsize

# Note that mgcep analysis basically assumes power-normalized window so that Σₙ w(n)² = 1
# The Blackman window is computed once and then scaled to unit energy.
win = DSP.blackman(framelen)
win = win ./ sqrt(sum(abs2, win))
@assert isapprox(sum(abs2, win), 1.0)

# Build the frame matrix: each column holds one time slice of `x`
# (`framelen` samples, consecutive slices overlapping by `noverlap` samples).
# The analysis window is applied to the columns afterwards.
as = arraysplit(x, framelen, noverlap)
xw = zeros(Float64, framelen, length(as))
# Frames are copied one by one by index.
# NOTE(review): arraysplit may reuse one internal buffer between frames, in which
# case collecting the frames without copying would be wrong -- confirm before refactoring.
for t in 1:length(as)
    xw[:, t] = as[t]
end
# Column-wise windowing: scale every frame (column) of xw by the analysis window.
xw .*= win;
@show size(xw)

# Clip `signal` to the Int16 range, round it to 16-bit samples and pass the result
# to the inline audio player at sampling rate `rate`.
function play_clipped(signal, rate)
    bounded = clamp(signal, typemin(Int16), typemax(Int16))
    inline_audioplayer(round(Int16, bounded), rate)
end

# Frequency-warping parameter for this sampling rate, shared by all mel-based analyses.
alpha = mcepalpha(fs)

# Mel-cepstrum analysis, displayed as an image with the origin at the bottom.
c = estimate(MelCepstrum(20, alpha), xw)
imshow(c, origin="lower", aspect="auto")
colorbar()

# Let's see spectral envelope estimate
envelope = real(mgc2sp(c, framelen))
imshow(envelope, origin="lower", aspect="auto")
colorbar()

# --- Cepstrum ---
c = estimate(LinearCepstrum(25), xw)
y = synthesis(base_excitation, c, hopsize)
wavcompare(x, y, label="Cepstrum-based synthesized waveform")
play_clipped(y, fs)

# --- Mel-cepstrum ---
c = estimate(MelCepstrum(25, alpha), xw)
y = synthesis(base_excitation, c, hopsize)
wavcompare(x, y, label="Mel-cepstrum-based synthesized waveform")
play_clipped(y, fs)

# --- Mel-generalized cepstrum ---
c = estimate(MelGeneralizedCepstrum(25, alpha, -1/4), xw)
y = synthesis(base_excitation, c, hopsize)
wavcompare(x, y, label="Mel-generalized cepstrum based synthesized waveform")
play_clipped(y, fs)

# --- LPC (estimated with use_mgcep=true) ---
l = estimate(LinearPredictionCoef(25), xw, use_mgcep=true)
y = synthesis(base_excitation, l, hopsize)
wavcompare(x, y, label="LPC-based synthesized waveform")
play_clipped(y, fs)

# --- PARCOR (converted from LPC) ---
l = lpc2par(estimate(LinearPredictionCoef(25), xw))
y = synthesis(base_excitation, l, hopsize)
wavcompare(x, y, label="PARCOR-based synthesized waveform")
play_clipped(y, fs)

# --- LSP (converted from LPC) ---
l = lpc2lsp(estimate(LinearPredictionCoef(15), xw))
y = synthesis(base_excitation, l, hopsize)
wavcompare(x, y, label="LSP-based synthesized waveform")
play_clipped(y, fs)