#!/usr/bin/env python
# coding: utf-8

# # Speech analysis and re-synthesis
# 
# This notebook demonstrates how to analyze speech and re-synthesize speech waveforms from speech parameters using [pysptk](https://github.com/r9y9/pysptk) (and other useful speech/audio/music analysis packages). Synthesized audio examples are provided so that you can compare synthesis filters in your browser.
# 
# ## Requirements
# 
# - pysptk: https://github.com/r9y9/pysptk
# - scipy
# - librosa: https://github.com/bmcfee/librosa
# - seaborn: https://github.com/mwaskom/seaborn

# In[1]:


# Run IPython's `%pylab inline` magic. Later cells call bare names such as
# `rcParams`, `plot`, `title`, `xlim`, `ylim`, `legend` and `colorbar` that are
# not imported anywhere else in this file, so they presumably come from this
# magic (NOTE(review): confirm). `get_ipython` is not defined in this file
# either, so this line presumably only works inside an IPython/Jupyter session.
get_ipython().run_line_magic('pylab', 'inline')


# In[2]:


import matplotlib
import seaborn
# Plot styling: seaborn's "whitegrid" style and a wide default figure size.
seaborn.set(style="whitegrid")
rcParams.update({"figure.figsize": (16, 5)})


# In[3]:


from IPython.display import Audio


# In[4]:


import numpy as np
import librosa
import librosa.display
import pysptk
from scipy.io import wavfile


# ## Data

# In[5]:


# Read the example audio file that pysptk points to.
example_wav = pysptk.util.example_audio_file()
sr, x = wavfile.read(example_wav)

# Sanity check: the example is expected to be sampled at 16 kHz.
assert sr == 16000

# Work on a float64 copy of the samples from here on.
x = x.astype(np.float64)
print(x.shape)


# In[6]:


# `librosa.display.waveplot` was removed in librosa 0.10 in favour of
# `waveshow`. Keep using `waveplot` where it still exists (so older
# environments render exactly as before) and fall back to `waveshow` otherwise.
draw_waveform = (
    getattr(librosa.display, "waveplot", None) or librosa.display.waveshow
)
draw_waveform(x, sr=sr)
title("Raw waveform of example audio file")
# Last expression of the cell, so the notebook displays the audio object.
Audio(x, rate=sr)


# ## Source parameter extraction
# 
# ### Framing and windowing

# In[7]:


# Frame size and frame shift used by every analysis step below.
frame_length = 1024
hop_length = 80

# Cut the signal into frames, then transpose so that axis 1 indexes the samples
# within a frame (verified by the assert at the end of this cell).
# Most pysptk functions expect C-contiguous arrays of np.float64.
frames = librosa.util.frame(
    x, frame_length=frame_length, hop_length=hop_length
).astype(np.float64).T

# Multiply every frame by a Blackman window, in place.
frames *= pysptk.blackman(frame_length)

assert frames.shape[1] == frame_length


# ### F0 estimation

# In[8]:


# Estimate the F0 trajectory with SWIPE', using the same hop size as the
# framing step and a 60-240 search range.
f0 = pysptk.swipe(
    x.astype(np.float64),
    fs=sr,
    hopsize=hop_length,
    min=60,
    max=240,
    otype="f0",
)

plot(f0, linewidth=3, label="F0 trajectory estimated by SWIPE'")
xlim(0, len(f0))
legend(prop={"size": 16});


# ## Source excitation generation

# In[9]:


# pysptk.excite assumes its input is *pitch*, not F0, so SWIPE' is run again
# with otype="pitch" instead of reusing the F0 trajectory.
pitch = pysptk.swipe(
    x.astype(np.float64),
    fs=sr,
    hopsize=hop_length,
    min=60,
    max=240,
    otype="pitch",
)
source_excitation = pysptk.excite(pitch, hop_length)

plot(source_excitation, label="Source excitation")
xlim(0, len(source_excitation))
ylim(-2, 16)
legend(prop={"size": 16});


# ## Synthesis from mel-cepstrum

# In[10]:


# Mel-cepstral analysis settings: analysis order and the `alpha` value passed
# to both `mcep` and `mgc2sp`.
order = 25
alpha = 0.41

# Mel-cepstrum for every windowed frame.
mc = pysptk.mcep(frames, order, alpha)

# Convert the mel-cepstrum to a spectrum with `mgc2sp` and keep its real part
# for display.
logH = pysptk.mgc2sp(mc, alpha, 0.0, frame_length).real
librosa.display.specshow(
    logH.T, sr=sr, hop_length=hop_length, x_axis="time", y_axis="linear"
)
colorbar()
title("Spectral envelope estimate from mel-cepstrum");


# In[11]:


from pysptk.synthesis import MLSADF, Synthesizer

# Convert mel-cepstrum to MLSADF filter coefficients.
b = pysptk.mc2b(mc, alpha)

# Drive the MLSA filter with the source excitation, using the same hop size as
# the analysis.
synthesizer = Synthesizer(MLSADF(order=order, alpha=alpha), hop_length)
x_synthesized = synthesizer.synthesis(source_excitation, b)

# `librosa.display.waveplot` was removed in librosa 0.10 in favour of
# `waveshow`. Keep using `waveplot` where it still exists and fall back to
# `waveshow` otherwise.
draw_waveform = (
    getattr(librosa.display, "waveplot", None) or librosa.display.waveshow
)
draw_waveform(x_synthesized, sr=sr)
title("Synthesized waveform by MLSADF")
# Last expression of the cell, so the notebook displays the audio object.
Audio(x_synthesized, rate=sr)


# ## Synthesis from cepstrum

# In[12]:


# Order of the cepstrum.
order = 25

# `mcep` is called with 0.0 in place of alpha here, which this notebook treats
# as plain cepstral analysis.
c = pysptk.mcep(frames, order, 0.0)

# Convert the cepstrum to a spectrum with `mgc2sp` and keep its real part for
# display.
logH = pysptk.mgc2sp(c, 0.0, 0.0, frame_length).real
librosa.display.specshow(
    logH.T, sr=sr, hop_length=hop_length, x_axis="time", y_axis="linear"
)
colorbar()
title("Spectral envelope estimate from cepstrum");


# In[13]:


from pysptk.synthesis import LMADF

# The cepstrum `c` is passed to the synthesizer directly; unlike the
# mel-cepstrum cell, no coefficient conversion is applied first.
synthesizer = Synthesizer(LMADF(order=order), hop_length)
x_synthesized = synthesizer.synthesis(source_excitation, c)

# `librosa.display.waveplot` was removed in librosa 0.10 in favour of
# `waveshow`. Keep using `waveplot` where it still exists and fall back to
# `waveshow` otherwise.
draw_waveform = (
    getattr(librosa.display, "waveplot", None) or librosa.display.waveshow
)
draw_waveform(x_synthesized, sr=sr)
title("Synthesized waveform by LMADF")
# No trailing semicolon: in a notebook a trailing `;` suppresses the cell's
# displayed result, which would hide the audio object that the other synthesis
# cells show.
Audio(x_synthesized, rate=sr)


# ## Synthesis from mel-generalized cepstrum

# In[14]:


# Mel-generalized cepstral analysis settings: analysis order, `alpha`, and a
# `gamma` derived from the number of filter stages.
order = 25
alpha = 0.41
stage = 5
gamma = -1.0 / stage

# Mel-generalized cepstrum for every windowed frame.
mgc = pysptk.mgcep(frames, order, alpha, gamma)

# Convert to a spectrum with `mgc2sp` and keep its real part for display.
logH = pysptk.mgc2sp(mgc, alpha, gamma, frame_length).real
librosa.display.specshow(
    logH.T, sr=sr, hop_length=hop_length, x_axis="time", y_axis="linear"
)
colorbar()
# Title spelling fixed ("cepstrm" -> "cepstrum").
title("Spectral envelope estimate from mel-generalized cepstrum");


# In[15]:


from pysptk.synthesis import MGLSADF

# Convert mel-generalized cepstrum to MGLSADF filter coefficients.
b = pysptk.mgc2b(mgc, alpha, gamma)

# `stage` is the same value from which `gamma` was derived in the analysis cell.
synthesizer = Synthesizer(
    MGLSADF(order=order, alpha=alpha, stage=stage), hop_length
)
x_synthesized = synthesizer.synthesis(source_excitation, b)

# `librosa.display.waveplot` was removed in librosa 0.10 in favour of
# `waveshow`. Keep using `waveplot` where it still exists and fall back to
# `waveshow` otherwise.
draw_waveform = (
    getattr(librosa.display, "waveplot", None) or librosa.display.waveshow
)
draw_waveform(x_synthesized, sr=sr)
title("Synthesized waveform by MGLSADF")
# Last expression of the cell, so the notebook displays the audio object.
Audio(x_synthesized, rate=sr)


# ## Synthesis from LPC

# In[16]:


# Analysis order, shared with the LPC analysis in the next cell.
order = 25

# Mel-generalized cepstral analysis with alpha = 0 and gamma = -1.0 (the
# setting named in the plot title below).
mgc = pysptk.mgcep(frames, order, 0.0, -1.0)

# Convert to a spectrum with `mgc2sp` and keep its real part for display.
logH = pysptk.mgc2sp(mgc, 0.0, -1.0, frame_length).real
librosa.display.specshow(
    logH.T, sr=sr, hop_length=hop_length, x_axis="time", y_axis="linear"
)
colorbar()
title("Spectral envelope estimate from mel-generalized cepstrum where alpha = 0 and gamma = -1.0");


# In[17]:


from pysptk.synthesis import AllPoleDF

# Linear prediction analysis for every windowed frame.
lpc = pysptk.lpc(frames, order)

# Replace column 0 with its natural log so that the coefficients handed to the
# synthesizer carry a log gain.
lpc[:, 0] = np.log(lpc[:, 0])

synthesizer = Synthesizer(AllPoleDF(order=order), hop_length)
x_synthesized = synthesizer.synthesis(source_excitation, lpc)

# `librosa.display.waveplot` was removed in librosa 0.10 in favour of
# `waveshow`. Keep using `waveplot` where it still exists and fall back to
# `waveshow` otherwise.
draw_waveform = (
    getattr(librosa.display, "waveplot", None) or librosa.display.waveshow
)
draw_waveform(x_synthesized, sr=sr)
title("Synthesized waveform by AllPoleDF")
# Last expression of the cell, so the notebook displays the audio object.
Audio(x_synthesized, rate=sr)


# For more synthesis filters, please refer to the documentation: http://pysptk.readthedocs.org/