This is a demo notebook of simple GMM-based voice conversion using the Voice Statistics (声優統計) corpus. Audio samples are embedded near the bottom, so you can listen to the converted speech and compare it with the originals.
Note: some of the libraries used here have not been publicly released yet, sorry.
The conversion method follows [T. Toda, A. W. Black, and K. Tokuda, "Voice Conversion Based on Maximum-Likelihood Estimation of Spectral Parameter Trajectory," IEEE Trans. Audio, Speech, Lang. Process., vol. 15, no. 8, pp. 2222–2235, Nov. 2007.](http://isw3.naist.jp/~tomoki/Tomoki/Journals/IEEE-Nov-2007_MLVC.pdf)
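For orientation, the joint-density GMM approach from that paper can be sketched as follows (notation paraphrased here, static features only). Source and target features are stacked into joint vectors $\boldsymbol{z}_t = [\boldsymbol{x}_t^{\top}, \boldsymbol{y}_t^{\top}]^{\top}$ and modeled by a GMM,

$$
P(\boldsymbol{z}) = \sum_{m=1}^{M} w_m \,\mathcal{N}\!\left(\boldsymbol{z};\, \boldsymbol{\mu}^{(z)}_m, \boldsymbol{\Sigma}^{(z)}_m\right),
\qquad
\boldsymbol{\Sigma}^{(z)}_m =
\begin{bmatrix}
\boldsymbol{\Sigma}^{(xx)}_m & \boldsymbol{\Sigma}^{(xy)}_m \\
\boldsymbol{\Sigma}^{(yx)}_m & \boldsymbol{\Sigma}^{(yy)}_m
\end{bmatrix},
$$

and a frame-wise minimum mean-square-error mapping is the conditional expectation

$$
\hat{\boldsymbol{y}}_t = \sum_{m=1}^{M} P(m \mid \boldsymbol{x}_t)\left[\boldsymbol{\mu}^{(y)}_m + \boldsymbol{\Sigma}^{(yx)}_m \left(\boldsymbol{\Sigma}^{(xx)}_m\right)^{-1}\left(\boldsymbol{x}_t - \boldsymbol{\mu}^{(x)}_m\right)\right].
$$

The trajectory-wise version in the paper additionally models delta features and generates the whole trajectory with MLPG, which is what this notebook does.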
%pylab inline
rcParams["figure.figsize"] = (16,5)
from nnmnkwii.datasets import BatchDataset
from nnmnkwii.datasets.voice_statistics import VoiceStatisticsWavDataSource
from nnmnkwii.preprocessing.alignment import DTWAligner
from nnmnkwii.preprocessing import DeltaAppender
from nnmnkwii.utils import trim_zeros_frames, remove_zeros_frames
from nnmnkwii.metrics import melcd
from nnmnkwii.baseline.gmm import MLPG
from os.path import join, expanduser, basename, splitext
import sys
import time
import numpy as np
from scipy.io import wavfile
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
import pyworld
import pysptk
from pysptk.synthesis import MLSADF, Synthesizer
import librosa
import librosa.display
import IPython
from IPython.display import Audio
Populating the interactive namespace from numpy and matplotlib
Put the voice statistics corpus data in a convenient location.
DATA_ROOT = join(expanduser("~"), "data", "voice-statistics")
!ls $DATA_ROOT
balance_sentences.txt fujitou_normal tsuchiya_normal uemura_normal fujitou_angry tsuchiya_angry uemura_angry fujitou_happy tsuchiya_happy uemura_happy
fs = 48000
fftlen = pyworld.get_cheaptrick_fft_size(fs)
alpha = pysptk.util.mcepalpha(fs)
order = 60
frame_period = 5
hop_length = int(fs * (frame_period * 0.001))
max_files = 30
test_size = 0.1
use_delta = True
if use_delta:
    windows = [
        (0, 0, np.array([1.0])),
        (1, 1, np.array([-0.5, 0.0, 0.5])),
        (1, 1, np.array([1.0, -2.0, 1.0])),
    ]
else:
    windows = [
        (0, 0, np.array([1.0])),
    ]
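Each window is a tuple of (left context width, right context width, window coefficients): the identity window keeps the static features, the second computes deltas, the third delta-deltas. As a rough illustration of what appending dynamic features means (the notebook relies on nnmnkwii's implementation; `compute_delta` and `append_deltas` below are hypothetical helpers written only for this sketch), each window is correlated with the static trajectory along the time axis and the results are stacked feature-wise:

import numpy as np

def compute_delta(x, window):
    """Apply one delta window to a static feature matrix x of shape (T, D)."""
    out = np.zeros_like(x)
    for d in range(x.shape[1]):
        # correlate each dimension's trajectory with the window coefficients
        out[:, d] = np.correlate(x[:, d], window, mode="same")
    return out

def append_deltas(x, windows):
    """Stack static, delta and delta-delta features along the feature axis."""
    return np.hstack([compute_delta(x, w) for _, _, w in windows])

For example, with the delta window above, `compute_delta` returns 0.5 * (x[t+1] - x[t-1]) at each interior frame.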
class MyDataSource(VoiceStatisticsWavDataSource):
    def __init__(self, *args, **kwargs):
        super(MyDataSource, self).__init__(*args, **kwargs)
        self.test_paths = None

    def collect_files(self):
        paths, labels = super(
            MyDataSource, self).collect_files()
        paths_train, paths_test, labels_train, labels_test = train_test_split(
            paths, labels, test_size=test_size, random_state=1234)
        # keep paths for later testing
        self.test_paths = paths_test
        return paths_train, labels_train

    def process_file(self, path):
        fs, x = wavfile.read(path)
        x = x.astype(np.float64)
        # WORLD analysis: F0 by dio, refined by stonemask, spectral envelope by cheaptrick
        f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
        spectrogram = trim_zeros_frames(pyworld.cheaptrick(x, f0, timeaxis, fs))
        # convert spectral envelope to mel-cepstrum
        mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
        return mc
# Data sources
# Since we need to create a parallel dataset, read data separately
fujitou_source = MyDataSource(data_root=DATA_ROOT,
                              speakers=["fujitou"], max_files_per_dir=max_files)
tsuhiya_source = MyDataSource(data_root=DATA_ROOT,
                              speakers=["tsuchiya"], max_files_per_dir=max_files)
Since we are using a small dataset, it's fine to load the entire dataset into memory at once.
# Build dataset as 3D tensor (NxTxD)
X, _ = BatchDataset(fujitou_source, max_num_frames=3200).load()
Y, _ = BatchDataset(tsuhiya_source, max_num_frames=3200).load()
print(X.shape)
print(Y.shape)
/home/ryuichi/anaconda3/lib/python3.6/site-packages/scipy/io/wavfile.py:273: WavFileWarning: Chunk (non-data) not understood, skipping it. WavFileWarning)
(27, 3200, 61)
(27, 3200, 61)
# Plotting util
def plot_parallel(x, y):
    figure(figsize=(16, 7))
    subplot(2, 1, 1)
    librosa.display.specshow(trim_zeros_frames(x).T, sr=fs, hop_length=hop_length, x_axis="time")
    colorbar()
    subplot(2, 1, 2)
    librosa.display.specshow(trim_zeros_frames(y).T, sr=fs, hop_length=hop_length, x_axis="time")
    colorbar()
idx = 0 # any
plot_parallel(X[idx],Y[idx])
Align the features between the two speakers using DTW (dynamic time warping).
# Alignment
X_aligned, Y_aligned = DTWAligner(verbose=0, dist=melcd).transform((X, Y))
plot_parallel(X_aligned[idx],Y_aligned[idx])
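For intuition, here is a minimal, illustrative sketch of what the alignment step does: plain DTW with a Euclidean frame distance. nnmnkwii's DTWAligner uses mel-cepstral distortion and operates on padded 3D arrays, so this is not its actual implementation, just the idea.

import numpy as np

def dtw_path(x, y):
    """Return (i, j) frame index pairs aligning x of shape (Tx, D) with y of shape (Ty, D)."""
    Tx, Ty = len(x), len(y)
    cost = np.full((Tx + 1, Ty + 1), np.inf)
    cost[0, 0] = 0.0
    # accumulate cost over the three standard DTW moves
    for i in range(1, Tx + 1):
        for j in range(1, Ty + 1):
            d = np.linalg.norm(x[i - 1] - y[j - 1])
            cost[i, j] = d + min(cost[i - 1, j - 1], cost[i - 1, j], cost[i, j - 1])
    # backtrack from the end to recover the warping path
    path, i, j = [], Tx, Ty
    while i > 1 or j > 1:
        path.append((i - 1, j - 1))
        moves = [(i - 1, j - 1), (i - 1, j), (i, j - 1)]
        i, j = min(moves, key=lambda ij: cost[ij])
    path.append((0, 0))
    return path[::-1]

Given such a path, a parallel pair could be built as x[[i for i, _ in path]] and y[[j for _, j in path]].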
num_utterances = len(X)
utt_lengths = np.zeros(num_utterances, dtype=np.int)
for i, (x, y) in enumerate(zip(X_aligned, Y_aligned)):
    utt_lengths[i] = np.maximum(
        len(trim_zeros_frames(x)), len(trim_zeros_frames(y)))
hist(utt_lengths, bins=64);
# Drop the 1st (power) dimension
X_aligned, Y_aligned = X_aligned[:, :, 1:], Y_aligned[:, :, 1:]
Later we use the delta features for sequence-wise parameter generation, i.e., Maximum Likelihood Parameter Generation (MLPG).
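Appending deltas matters because MLPG generates a whole trajectory at once. Writing $W$ for the matrix that maps a static sequence $\boldsymbol{y}$ to its static+delta expansion, and $\boldsymbol{\mu}$, $\boldsymbol{\Sigma}$ for the sequence of frame-wise (conditional) means and covariances predicted by the model, MLPG solves

$$
\hat{\boldsymbol{y}} = \arg\max_{\boldsymbol{y}} \,\mathcal{N}\!\left(W\boldsymbol{y};\, \boldsymbol{\mu}, \boldsymbol{\Sigma}\right)
\quad\Longleftrightarrow\quad
\left(W^{\top}\boldsymbol{\Sigma}^{-1}W\right)\hat{\boldsymbol{y}} = W^{\top}\boldsymbol{\Sigma}^{-1}\boldsymbol{\mu}.
$$

Without the delta constraints ($W = I$) this reduces to frame-by-frame conversion, which tends to produce discontinuous trajectories; the comparison later in the notebook makes this audible.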
static_dim = X_aligned.shape[-1]
if use_delta:
    X_aligned = DeltaAppender(windows=windows).transform(X_aligned)
    Y_aligned = DeltaAppender(windows=windows).transform(Y_aligned)
plot_parallel(X_aligned[idx],Y_aligned[idx])
The aligned source and target features are concatenated frame-by-frame into joint feature vectors; these are used to train the GMM.
XY = np.concatenate((X_aligned, Y_aligned), axis=-1).reshape(-1, X_aligned.shape[-1]*2)
print(XY.shape)
(86400, 360)
XY = remove_zeros_frames(XY)
print(XY.shape)
(55955, 360)
gmm = GaussianMixture(
n_components=64, covariance_type="full", max_iter=100, verbose=1)
%time gmm.fit(XY)
Initialization 0
  Iteration 0
  Iteration 10
  Iteration 20
Initialization converged: True
CPU times: user 45min 3s, sys: 5min 37s, total: 50min 40s
Wall time: 12min 55s
GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100, means_init=None, n_components=64, n_init=1, precisions_init=None, random_state=None, reg_covar=1e-06, tol=0.001, verbose=1, verbose_interval=10, warm_start=False, weights_init=None)
for k in range(3):
    plot(gmm.means_[k], linewidth=1.5, label="Mean of mixture {}".format(k+1))
legend(prop={"size": 16})
<matplotlib.legend.Legend at 0x7f4da1f6b780>
imshow(gmm.covariances_[0], origin="bottom left")
colorbar()
<matplotlib.colorbar.Colorbar at 0x7f4da0021b00>
Diagonal part of the covariance matrix, which is expected to be dominant.
for k in range(3):
    plot(np.diag(gmm.covariances_[k]), linewidth=1.5,
         label="Diagonal part of covariance matrix, mixture {}".format(k))
legend(prop={"size": 16})
<matplotlib.legend.Legend at 0x7f4da0040ac8>
Once the GMM is trained on the joint features, we can perform parameter generation using MLPG.
# Set True to enable voice conversion based on the spectral differential
diffvc = True
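When `diffvc` is enabled, the parameter generator below is constructed with `diff=True`, so it outputs a differential mel-cepstrum, roughly

$$
\hat{\boldsymbol{y}}^{\mathrm{diff}}_t = \hat{\boldsymbol{y}}_t - \boldsymbol{x}_t,
$$

and synthesis then filters the original waveform with an MLSA filter built from that difference (the `Synthesizer(MLSADF(...))` branch in `test_one_utt` below), instead of running a full vocoder analysis/synthesis. The benefit is that the natural excitation of the input speech is preserved.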
len(fujitou_source.test_paths)
3
def test_one_utt(src_path, tgt_path, disable_mlpg=False):
    # GMM-based parameter generation is provided by the library in `baseline` module
    if disable_mlpg:
        # Force disable MLPG
        paramgen = MLPG(gmm, windows=[(0, 0, np.array([1.0]))], diff=diffvc)
    else:
        paramgen = MLPG(gmm, windows=windows, diff=diffvc)

    fs, x = wavfile.read(src_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    if use_delta:
        mc = DeltaAppender(windows=windows).transform(
            mc[None, :, :]).reshape(-1, static_dim * len(windows))
    mc = paramgen.transform(mc)
    if disable_mlpg and mc.shape[-1] != static_dim:
        mc = mc[:, :static_dim]
    assert mc.shape[-1] == static_dim

    mc = np.hstack((c0[:, None], mc))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=order, alpha=alpha), hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, frame_period)

    _, y = wavfile.read(tgt_path)
    return x, y, waveform
Compare 1) w/o MLPG and 2) w/ MLPG
for i, (src_path, tgt_path) in enumerate(zip(fujitou_source.test_paths, tsuhiya_source.test_paths)):
    print("{}-th sample".format(i+1))
    x, y, wo_MLPG = test_one_utt(src_path, tgt_path, disable_mlpg=True)
    _, _, w_MLPG = test_one_utt(src_path, tgt_path, disable_mlpg=False)

    # Notebook stuff
    print(src_path)
    IPython.display.display(Audio(x, rate=fs))
    print(tgt_path)
    IPython.display.display(Audio(y, rate=fs))
    print("w/o MLPG")
    IPython.display.display(Audio(wo_MLPG, rate=fs))
    print("w/ MLPG")
    IPython.display.display(Audio(w_MLPG, rate=fs))
1-th sample
/home/ryuichi/anaconda3/lib/python3.6/site-packages/scipy/io/wavfile.py:273: WavFileWarning: Chunk (non-data) not understood, skipping it. WavFileWarning)
/home/ryuichi/data/voice-statistics/fujitou_normal/fujitou_normal_008.wav
/home/ryuichi/data/voice-statistics/tsuchiya_normal/tsuchiya_normal_008.wav
w/o MLPG
w/ MLPG
2-th sample
/home/ryuichi/data/voice-statistics/fujitou_normal/fujitou_normal_011.wav
/home/ryuichi/data/voice-statistics/tsuchiya_normal/tsuchiya_normal_011.wav
w/o MLPG
w/ MLPG
3-th sample
/home/ryuichi/data/voice-statistics/fujitou_normal/fujitou_normal_005.wav
/home/ryuichi/data/voice-statistics/tsuchiya_normal/tsuchiya_normal_005.wav
w/o MLPG
w/ MLPG
def vis_difference(x, y, which_dims=[0, 2, 3, 6, 8], T_max=None):
    # need to set diffvc = False
    static_paramgen = MLPG(gmm, windows=[(0, 0, np.array([1.0]))], diff=False)
    paramgen = MLPG(gmm, windows=windows, diff=False)

    x = trim_zeros_frames(x)
    y = trim_zeros_frames(y)[:, :static_dim]
    y_hat1 = static_paramgen.transform(x)[:, :static_dim]
    y_hat2 = paramgen.transform(x)
    if T_max is not None and len(y) > T_max:
        y, y_hat1, y_hat2 = y[:T_max], y_hat1[:T_max], y_hat2[:T_max]

    figure(figsize=(16, 4*len(which_dims)))
    for idx, which_dim in enumerate(which_dims):
        subplot(len(which_dims), 1, idx+1)
        plot(y[:, which_dim], "--", linewidth=1, label="Target")
        plot(y_hat1[:, which_dim], "-", linewidth=1.5, label="w/o MLPG")
        plot(y_hat2[:, which_dim], "-", linewidth=2, label="w/ MLPG")
        title("{}-th coef".format(which_dim+1), fontsize=16)
        legend(prop={"size": 16}, loc="upper right")
idx = 0
which_dims = np.arange(0, static_dim, step=2)
vis_difference(X_aligned[idx], Y_aligned[idx], T_max=300, which_dims=which_dims)