🐙
Tacotron: Towards End-to-End Speech Synthesis: https://arxiv.org/abs/1703.10135
# IPython magic (not plain Python): populates the interactive namespace from
# numpy and matplotlib and renders figures inline in the notebook.
# NOTE(review): rcParams, figure, subplot, imshow, xlabel, ylabel and colorbar
# are used unqualified in this notebook and are not imported anywhere else in
# the visible source, so they presumably come from this magic -- keep it.
%pylab inline
# Default size for figures that do not pass their own figsize.
rcParams["figure.figsize"] = (16,5)
# Use text & audio modules from existing Tacotron implementation.
import sys
sys.path.append("../lib/tacotron")
from text import text_to_sequence, symbols
from util import audio
Populating the interactive namespace from numpy and matplotlib
import torch
from torch.autograd import Variable
import numpy as np
from tacotron_pytorch import Tacotron
from hparams import hparams
import os
import librosa
import librosa.display
import IPython
from IPython.display import Audio
# Audio settings shared by the helpers in this notebook.
# fs is handed to librosa's specshow as `sr` and to the IPython Audio widget
# as the playback `rate`.
fs = 20000
# hop_length is forwarded to specshow.
# NOTE(review): presumably this must match the hop size used by the `audio`
# module when it builds spectrograms -- confirm against hparams.
hop_length = 250

# Run the model and its inputs on the GPU whenever CUDA is usable.
use_cuda = torch.cuda.is_available()
def visualize(alignment, spectrogram):
    """Plot an attention alignment and a spectrogram in one tall figure.

    Both inputs are transposed before plotting.

    Args:
        alignment: 2-D array drawn in the top panel; after the transpose the
            x axis is labelled as decoder steps and the y axis as encoder
            steps (so presumably shaped (decoder_steps, encoder_steps)).
        spectrogram: 2-D array drawn in the bottom panel with a time x axis
            and a linear frequency y axis (presumably (frames, freq_bins)).
    """
    figure(figsize=(16, 16))

    # Top panel: attention alignment.
    subplot(2, 1, 1)
    attention_map = alignment.T
    # NOTE(review): interpolation=None defers to matplotlib's configured
    # default; the string "none" would be needed to request no interpolation.
    imshow(attention_map, interpolation=None, origin="lower", aspect="auto")
    ylabel("Encoder timestamp")
    xlabel("Decoder timestamp")
    colorbar()

    # Bottom panel: spectrogram, with axes scaled by the notebook-level
    # sample rate and hop length.
    subplot(2, 1, 2)
    axis_options = dict(
        sr=fs,
        hop_length=hop_length,
        x_axis="time",
        y_axis="linear",
    )
    librosa.display.specshow(spectrogram.T, **axis_options)
    colorbar()
def tts(model, text):
    """Synthesize speech for ``text`` with the given model.

    The model is moved to the GPU when ``use_cuda`` is set. Its encoder and
    postnet are switched to eval mode; the decoder is deliberately left in
    whatever mode it is already in (see the TODO below).

    Args:
        model: network exposing ``encoder``, ``postnet`` and a forward call
            that returns ``(mel_outputs, linear_outputs, alignments)``.
        text: input string, converted to a symbol-id sequence with the
            "english_cleaners" cleaner.

    Returns:
        Tuple ``(waveform, alignment, spectrogram)``: the signal returned by
        ``audio.inv_spectrogram`` for the transposed linear output, the first
        batch element of the alignments as a numpy array, and the linear
        output passed through ``audio._denormalize``.
    """
    if use_cuda:
        model = model.cuda()

    # TODO: Turning off dropout of decoder's prenet causes serious performance
    # regression, not sure why.
    # model.decoder.eval()
    for submodule in (model.encoder, model.postnet):
        submodule.eval()

    symbol_ids = np.array(text_to_sequence(text, ["english_cleaners"]))
    # unsqueeze(0) adds a leading batch dimension of size one.
    batch = Variable(torch.from_numpy(symbol_ids)).unsqueeze(0)
    if use_cuda:
        batch = batch.cuda()

    # Greedy decoding
    _mel_outputs, linear_outputs, alignments = model(batch)

    # Only the first (and only) batch element is used from here on.
    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()

    # Predicted audio signal, reconstructed from the model's raw linear
    # output rather than from the denormalized copy.
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram
def test_one_text(model, text, figures=True):
    """Synthesize ``text`` once and show the result in the notebook.

    Args:
        model: model handed through to ``tts``.
        text: sentence to synthesize.
        figures: when true, also plot the alignment and spectrogram via
            ``visualize``; the audio player is displayed either way.
    """
    waveform, alignment, spectrogram = tts(model, text)
    player = Audio(waveform, rate=fs)

    if figures:
        visualize(alignment, spectrogram)

    IPython.display.display(player)
# Build the network with a vocabulary size equal to the number of text symbols.
model = Tacotron(n_vocab=len(symbols))

# Choose your favorite model
checkpoint_path = "../checkpoints/checkpoint_step772000.pth"

# Deserialize every tensor onto the CPU so that a checkpoint written from a
# GPU run also loads on a machine without CUDA; tts() moves the model to the
# GPU afterwards when one is available. The callable form of map_location is
# used because it simply returns each storage as loaded (on the CPU).
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
checkpoint = torch.load(
    checkpoint_path,
    map_location=lambda storage, location: storage,
)
model.load_state_dict(checkpoint["state_dict"])
# Sample sentence, translated from Japanese. Source tweet:
# https://twitter.com/vmpmember/status/911137213189984257
text = (
    "It seems everytime change the speech. "
    "There was inconsistent results each time as if we say in voice "
    "conversion using a hostile learning trend."
)

# Synthesize the same sentence ten times, audio only (no figures).
# NOTE(review): tts() leaves the decoder out of eval mode, so the repeats
# presumably exist to hear the run-to-run variation -- confirm.
for idx in range(10):
    test_one_text(model, text, figures=False)