#!/usr/bin/env python
# coding: utf-8

# 🐙
# 
# Tacotron: Towards End-to-End Speech Synthesis: https://arxiv.org/abs/1703.10135
# 
# code: https://github.com/r9y9/tacotron_pytorch

# In[1]:

# Choose your favorite model
checkpoint_path = "../checkpoints/checkpoint_step720000.pth"

# In[2]:

get_ipython().run_line_magic('pylab', 'inline')
rcParams["figure.figsize"] = (16, 5)

# Use text & audio modules from the existing Tacotron implementation.
import sys
sys.path.insert(0, "../lib/tacotron")
from text import text_to_sequence, symbols
from util import audio

# In[20]:

import torch
import numpy as np
from tacotron_pytorch import Tacotron
from synthesis import tts as _tts
from hparams import hparams
import os
import librosa
import librosa.display
import IPython
from IPython.display import Audio

# In[4]:

fs = hparams.sample_rate
hop_length = 250

# In[5]:

def visualize(alignment, spectrogram):
    label_fontsize = 16
    figure(figsize=(16, 16))

    # Attention alignment (encoder steps vs. decoder steps)
    subplot(2, 1, 1)
    imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    xlabel("Decoder timestep", fontsize=label_fontsize)
    ylabel("Encoder timestep", fontsize=label_fontsize)
    colorbar()

    # Predicted linear-scale spectrogram
    subplot(2, 1, 2)
    librosa.display.specshow(spectrogram.T, sr=fs, hop_length=hop_length,
                             x_axis="time", y_axis="linear")
    xlabel("Time", fontsize=label_fontsize)
    ylabel("Hz", fontsize=label_fontsize)
    tight_layout()
    colorbar()

# In[6]:

def tts(model, text, figures=True):
    waveform, alignment, spectrogram = _tts(model, text)
    if figures:
        visualize(alignment, spectrogram)
    IPython.display.display(Audio(waveform, rate=fs))

# ## Model

# In[7]:

model = Tacotron(n_vocab=len(symbols),
                 embedding_dim=256,
                 mel_dim=hparams.num_mels,
                 linear_dim=hparams.num_freq,
                 r=hparams.outputs_per_step,
                 padding_idx=hparams.padding_idx,
                 use_memory_mask=hparams.use_memory_mask,
                 )
checkpoint = torch.load(checkpoint_path)
model.load_state_dict(checkpoint["state_dict"])

# Set a large max_decoder_steps to handle long sentence outputs
model.decoder.max_decoder_steps = 500

# ## TTS samples

# ### Generated audio and alignment

# In[8]:

tts(model, "Hi, my name is Tacotron. I'm still learning a lot from data.")

# In[9]:

tts(model, "Training neural networks is very hard!")

# In[10]:

tts(model, "Generative adversarial network or variational auto-encoder.")

# ### Compare with keithito/tacotron
# 
# Same sentences used in https://keithito.github.io/audio-samples/

# In[11]:

texts = [
    "Scientists at the CERN laboratory say they have discovered a new particle.",
    "There's a way to measure the acute emotional intelligence that has never gone out of style.",
    "President Trump met with other leaders at the Group of 20 conference.",
    "The Senate's bill to repeal and replace the Affordable Care Act is now imperiled.",
    "Generative adversarial network or variational auto-encoder.",
    "The buses aren't the problem, they actually provide a solution.",
]

for idx, text in enumerate(texts):
    print(idx, text)
    tts(model, text, figures=False)

# ### Try long inputs
# 
# Interestingly, Tacotron can generate fairly long speech samples even though it was trained on a dataset consisting of short clips.

# In[12]:

# ref: https://americanliterature.com/childrens-stories/little-red-riding-hood
text = """Once upon a time there was a dear little girl who was loved by every one who looked at her, but most of all by her grandmother, and there was nothing that she would not have given to the child.
""".replace('\n', ' ')
print(len(text))
tts(model, text)

# In[13]:

# Ref: https://arxiv.org/abs/1703.10135
text = """A text-to-speech synthesis system typically consists of multiple stages, such as a text analysis frontend, an acoustic model and an audio synthesis module. """.replace('\n', ' ')
print(len(text))
tts(model, text)

# ### Failure cases: too long inputs

# In[14]:

# Ref: https://arxiv.org/abs/1703.10135
text = """A text-to-speech synthesis system typically consists of multiple stages, such as a text analysis frontend, an acoustic model and an audio synthesis module. Building these components often requires extensive domain expertise and may contain brittle design choices.""".replace('\n', ' ')
print(len(text))
tts(model, text)

# In[15]:

# ref: https://americanliterature.com/childrens-stories/little-red-riding-hood
text = """Once upon a time there was a dear little girl who was loved by every one who looked at her, but most of all by her grandmother, and there was nothing that she would not have given to the child. Once she gave her a little cap of red velvet, which suited her so well that she would never wear anything else. So she was always called Little Red Riding Hood. """
print(len(text))
tts(model, text)

# ### Compare with Google's demo
# 
# Same sentences used in https://google.github.io/tacotron/

# In[16]:

texts = [
    "Generative adversarial network or variational auto-encoder.",
    "Basilar membrane and otolaryngology are not auto-correlations.",
    "He has read the whole thing.",
    "He reads books.",
    "Thisss isrealy awhsome.",
    "This is your personal assistant, Google Home.",
    "This is your personal assistant Google Home.",
    # The two sentences below are identical under our experimental settings
    # "The buses aren't the problem, they actually provide a solution.",
    # "The buses aren't the PROBLEM, they actually provide a SOLUTION.",
    "The quick brown fox jumps over the lazy dog.",
    "Does the quick brown fox jump over the lazy dog?",
]

for idx, text in enumerate(texts):
    print(idx, text)
    tts(model, text, figures=False)

# ## Compare to ground truth

# In[17]:

from os.path import join
from scipy.io import wavfile

# In[18]:

data_root = "/home/ryuichi/data/LJSpeech-1.0/"

# In[19]:

with open(join(data_root, "metadata.csv")) as f:
    lines = f.readlines()[:10]

for line in lines:
    line = line.strip().split("|")
    name, text = line[0], line[1]
    if text[-1] not in '!,.:;?':
        text = text + '.'  # without this, the decoder is confused about when to output EOS
    print(text)

    # Target wav
    wav_path = join(data_root, "wavs", name + ".wav")
    _fs, target_waveform = wavfile.read(wav_path)
    IPython.display.display(Audio(target_waveform, rate=_fs))

    # Generated wav
    waveform, _, _ = _tts(model, text)
    IPython.display.display(Audio(waveform, rate=fs))

# Well, far from ground truth :(