In [1]:
# Populate the interactive namespace from numpy and matplotlib (this is where
# `np` and `rcParams` used in this notebook come from) and render figures inline.
%pylab inline
# Default figure size for every plot in this notebook (matplotlib uses inches).
rcParams["figure.figsize"] = (16,5)

import sys
# Put the parent directory first on the module search path -- presumably so the
# project modules imported later (gantts, hparams, evaluation_vc) resolve from
# the repository root; confirm against the repository layout.
sys.path.insert(0, "..")
Populating the interactive namespace from numpy and matplotlib
In [2]:
import torch

from scipy.io import wavfile
import pysptk
from pysptk.synthesis import Synthesizer, MLSADF
import pyworld
import os
from os.path import join, basename

from nnmnkwii import preprocessing as P
from nnmnkwii.paramgen import unit_variance_mlpg_matrix

import gantts
from hparams import vc as hp

import librosa
import librosa.display
import IPython
from IPython.display import Audio
In [3]:
# Directory holding data_mean.npy / data_var.npy; it is also handed to
# get_wav_files() further down in this notebook.
data_dir = "../data/cmu_arctic_vc/"

# Root of the CMU ARCTIC corpus. The default is the location the notebook was
# originally run with; set the CMU_ARCTIC_ROOT environment variable to point at
# a copy of the corpus on another machine instead of editing this cell.
cmu_arctic_root = os.environ.get("CMU_ARCTIC_ROOT", "/home/ryuichi/data/cmu_arctic")
# The trailing "" component makes join() keep the trailing path separator, so
# with the default root these are exactly the strings that were hard-coded before.
clb_wav_dir = join(cmu_arctic_root, "cmu_us_clb_arctic", "wav", "")
slt_wav_dir = join(cmu_arctic_root, "cmu_us_slt_arctic", "wav", "")

# Normalisation statistics; the standard deviation is derived from the stored
# variance rather than loaded from disk.
data_mean = np.load(join(data_dir, "data_mean.npy"))
data_var = np.load(join(data_dir, "data_var.npy"))
data_std = np.sqrt(data_var)
In [4]:
# Any generator dimension left as None in the hyperparameters falls back to the
# size of the last axis of the loaded statistics.
for dim_key in ("in_dim", "out_dim"):
    if hp.generator_params[dim_key] is None:
        hp.generator_params[dim_key] = data_mean.shape[-1]
In [5]:
# Analysis constants for this notebook.
fs = 16000  # sampling rate; also used as the playback rate for Audio() below
# hp.frame_period scaled by 1e-3 (presumably milliseconds -> seconds; confirm in hparams).
_frame_period_scaled = hp.frame_period * 0.001
hop_length = int(fs * _frame_period_scaled)
# FFT size that pyworld's CheapTrick uses at this sampling rate.
fftlen = pyworld.get_cheaptrick_fft_size(fs)
static_dim = hp.order

Models

Baseline: In2Out highway network

In [6]:
# Build the generator architecture named in the hyperparameters and show it.
model_in2out = getattr(gantts.models, hp.generator)(**hp.generator_params)
print(model_in2out)

checkpoint_path = "../checkpoints/vc/baseline/checkpoint_epoch200_Generator.pth"
print("Load checkpoint from: {}".format(checkpoint_path))
# Remap every storage to the CPU so that a checkpoint written on a GPU machine
# also loads where CUDA is unavailable (the model itself is never moved to a GPU
# in this notebook). NOTE: torch.load unpickles the file -- only open checkpoints
# from a trusted source.
checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
# This notebook only runs inference: put the Dropout layer (see the printed
# architecture) into evaluation mode so conversion results are deterministic.
model_in2out.eval()
model_in2out.load_state_dict(checkpoint["state_dict"])
In2OutHighwayNet (
  (relu): LeakyReLU (0.01)
  (sigmoid): Sigmoid ()
  (T): Linear (59 -> 59)
  (H): ModuleList (
    (0): Linear (177 -> 512)
    (1): Linear (512 -> 512)
    (2): Linear (512 -> 512)
  )
  (last_linear): Linear (512 -> 177)
  (dropout): Dropout (p = 0.5)
)
Load checkpoint from: ../checkpoints/vc/baseline/checkpoint_epoch200_Generator.pth

GAN

In [7]:
# Build the generator architecture named in the hyperparameters and show it.
model_gan = getattr(gantts.models, hp.generator)(**hp.generator_params)
print(model_gan)

checkpoint_path = "../checkpoints/vc/gan/checkpoint_epoch200_Generator.pth"
print("Load checkpoint from: {}".format(checkpoint_path))
# Remap every storage to the CPU so that a checkpoint written on a GPU machine
# also loads where CUDA is unavailable (the model itself is never moved to a GPU
# in this notebook). NOTE: torch.load unpickles the file -- only open checkpoints
# from a trusted source.
checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
# This notebook only runs inference: put the Dropout layer (see the printed
# architecture) into evaluation mode so conversion results are deterministic.
model_gan.eval()
model_gan.load_state_dict(checkpoint["state_dict"])
In2OutHighwayNet (
  (relu): LeakyReLU (0.01)
  (sigmoid): Sigmoid ()
  (T): Linear (59 -> 59)
  (H): ModuleList (
    (0): Linear (177 -> 512)
    (1): Linear (512 -> 512)
    (2): Linear (512 -> 512)
  )
  (last_linear): Linear (512 -> 177)
  (dropout): Dropout (p = 0.5)
)
Load checkpoint from: ../checkpoints/vc/gan/checkpoint_epoch200_Generator.pth

Compare generated audio samples

Baseline vs GAN

In [8]:
from evaluation_vc import get_wav_files, test_vc_from_path

# Test utterance paths for the source (clb) and target (slt) wav directories.
src_test_files = get_wav_files(data_dir, clb_wav_dir, test=True)
tgt_test_files = get_wav_files(data_dir, slt_wav_dir, test=True)

for src_path, tgt_path in zip(src_test_files, tgt_test_files):
    # Keep each file's sampling rate in its own name: rebinding the notebook-level
    # `fs` here would silently desynchronise it from the hop_length / fftlen
    # values that were derived from it earlier.
    src_fs, src_waveform = wavfile.read(src_path)
    tgt_fs, tgt_waveform = wavfile.read(tgt_path)

    # Only the first returned item (the converted waveform) is used. Indexing
    # instead of unpacking into `_` avoids shadowing IPython's last-output variable.
    in2out_waveform = test_vc_from_path(model_in2out, src_path, data_mean, data_std, diffvc=True)[0]
    gan_waveform = test_vc_from_path(model_gan, src_path, data_mean, data_std, diffvc=True)[0]

    print(basename(src_path), ": source, target, baseline, GAN")
    # Pair every clip with the rate it should be played at. The converted clips
    # are produced from the source file, so they are assumed to share its rate
    # -- TODO confirm against test_vc_from_path.
    clips = [
        (src_waveform, src_fs),
        (tgt_waveform, tgt_fs),
        (in2out_waveform, src_fs),
        (gan_waveform, src_fs),
    ]
    for waveform, rate in clips:
        IPython.display.display(Audio(waveform, rate=rate))
arctic_a0496.wav : source, target, baseline, GAN
arctic_a0497.wav : source, target, baseline, GAN
arctic_a0498.wav : source, target, baseline, GAN