🐙

Tacotron: Towards End-to-End Speech Synthesis: https://arxiv.org/abs/1703.10135

In [1]:

# %pylab inline populates the interactive namespace from numpy and matplotlib.
# The bare plotting names used later in this notebook (figure, subplot,
# imshow, xlabel, ylabel, colorbar) and rcParams are not imported anywhere
# else in the visible cells, so they presumably come from this magic.
%pylab inline
# Default size for figures that do not pass their own figsize.
rcParams["figure.figsize"] = (16,5)

# Use text & audio modules from existing Tacotron implementation.
# The path is relative, so it resolves against the notebook's working directory.
import sys
sys.path.append("../lib/tacotron")
from text import text_to_sequence, symbols
from util import audio

Populating the interactive namespace from numpy and matplotlib

In [2]:

import torch
from torch.autograd import Variable
import numpy as np

from tacotron_pytorch import Tacotron
from hparams import hparams

import os
import librosa
import librosa.display
import IPython
from IPython.display import Audio

In [3]:

# Run on the GPU whenever CUDA is available; tts() checks this flag before
# moving the model and the input sequence with .cuda().
use_cuda = torch.cuda.is_available()
# Passed as the sample rate to librosa's specshow (sr=) and to the IPython
# Audio player (rate=).
# NOTE(review): presumably must match the sample rate used to train the
# checkpoint -- confirm against hparams.
fs = 20000
# Passed as hop_length to specshow, which uses it for the time axis.
# NOTE(review): presumably must match the hop size used by the audio module --
# confirm against hparams.
hop_length = 250

In [4]:

def visualize(alignment, spectrogram):
    """Draw the alignment and the spectrogram as two stacked panels.

    Both arrays are transposed before plotting. The function relies on the
    pylab plotting names (``figure``, ``subplot``, ``imshow``, ...) and on the
    notebook-level ``fs`` and ``hop_length`` settings.
    """
    figure(figsize=(16, 16))

    # Upper panel: alignment matrix; the axis labels put the decoder on x and
    # the encoder on y.
    subplot(2, 1, 1)
    alignment_image = alignment.T
    imshow(alignment_image, aspect="auto", origin="lower", interpolation=None)
    xlabel("Decoder timestamp")
    ylabel("Encoder timestamp")
    colorbar()

    # Lower panel: spectrogram against time on a linear frequency axis.
    subplot(2, 1, 2)
    spectrogram_image = spectrogram.T
    librosa.display.specshow(
        spectrogram_image,
        sr=fs,
        hop_length=hop_length,
        x_axis="time",
        y_axis="linear",
    )
    colorbar()

In [5]:

def tts(model, text):
    """Synthesize ``text`` with ``model``.

    Returns a ``(waveform, alignment, spectrogram)`` tuple:

    * ``waveform``    -- result of ``audio.inv_spectrogram`` on the first
      linear output of the batch,
    * ``alignment``   -- first alignment of the batch as a numpy array,
    * ``spectrogram`` -- the same linear output after ``audio._denormalize``.

    The model is moved to the GPU when ``use_cuda`` is set.
    """
    if use_cuda:
        model = model.cuda()

    # Only the encoder and the postnet are switched to eval mode.
    # TODO: Turning off dropout of decoder's prenet causes serious performance
    # regression, not sure why.
    # model.decoder.eval()
    for submodule in (model.encoder, model.postnet):
        submodule.eval()

    # Text -> symbol ids -> tensor with an extra leading axis (unsqueeze(0)).
    symbol_ids = np.array(text_to_sequence(text, ["english_cleaners"]))
    model_input = Variable(torch.from_numpy(symbol_ids)).unsqueeze(0)
    if use_cuda:
        model_input = model_input.cuda()

    # Greedy decoding
    _mel_outputs, linear_outputs, alignments = model(model_input)

    # Keep only the first item of each output and bring it back as numpy.
    first_linear = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(first_linear)
    first_alignment = alignments[0].cpu().data.numpy()

    # Predicted audio signal
    waveform = audio.inv_spectrogram(first_linear.T)

    return waveform, first_alignment, spectrogram

In [6]:

def test_one_text(model, text, figures=True):
    """Synthesize ``text`` once and show an audio player for the result.

    When ``figures`` is truthy, the alignment and spectrogram are plotted with
    ``visualize`` before the player is displayed.
    """
    wav, attention, spec = tts(model, text)

    if figures:
        visualize(attention, spec)

    player = Audio(wav, rate=fs)
    IPython.display.display(player)

Model

In [7]:

# Build the network with one embedding entry per text symbol.
model = Tacotron(n_vocab=len(symbols))

# Choose your favorite model
checkpoint_path = "../checkpoints/checkpoint_step772000.pth"
# Load every tensor onto the CPU regardless of the device it was saved from,
# so the checkpoint also loads on machines without CUDA. tts() moves the model
# to the GPU afterwards when use_cuda is set.
# NOTE: torch.load unpickles the file, which can execute arbitrary code --
# only load checkpoints from a source you trust.
checkpoint = torch.load(checkpoint_path,
                        map_location=lambda storage, loc: storage)
model.load_state_dict(checkpoint["state_dict"])

Try the same input multiple times

In [11]:

# Input sentence, translated from Japanese: https://twitter.com/vmpmember/status/911137213189984257
text = "It seems everytime change the speech. There was inconsistent results each time as if we say in voice conversion using a hostile learning trend."
# Synthesize the same sentence 10 times, audio only (figures=False), so the
# individual runs can be compared by ear.
# NOTE(review): differences between runs presumably stem from tts() leaving the
# decoder out of eval mode (see the TODO there) -- confirm.
for idx in range(10):
    test_one_text(model, text, figures=False)