from functools import partial
from nnsvs.acoustic_models.multistream import (
MDNMultistreamSeparateF0MelModel,
MultistreamSeparateF0MelModel,
MultistreamSeparateF0ParametricModel,
NPSSMDNMultistreamParametricModel,
NPSSMultistreamParametricModel,
)
from nnsvs.acoustic_models.sinsy import ResSkipF0FFConvLSTM
from nnsvs.acoustic_models.tacotron import (
BiLSTMMDNNonAttentiveDecoder,
BiLSTMNonAttentiveDecoder,
MDNNonAttentiveDecoder,
NonAttentiveDecoder,
)
from nnsvs.acoustic_models.tacotron_f0 import (
BiLSTMResF0NonAttentiveDecoder,
MDNResF0NonAttentiveDecoder,
ResF0NonAttentiveDecoder,
)
from nnsvs.acoustic_models.util import predict_lf0_with_residual
from nnsvs.base import BaseModel, PredictionType
from nnsvs.layers.conv import ResnetBlock, WNConv1d
from nnsvs.mdn import MDNLayer, mdn_get_most_probable_sigma_and_mu
from nnsvs.model import TransformerEncoder, VariancePredictor
from nnsvs.util import init_weights
from torch import nn
# Public API of this module: locally defined residual-F0 models plus
# symbols re-exported from the acoustic_models submodules above.
__all__ = [
    # Non-autoregressive models
    "ResF0Conv1dResnet",
    "ResSkipF0FFConvLSTM",
    "ResF0VariancePredictor",
    "ResF0TransformerEncoder",
    # Autoregressive models
    "NonAttentiveDecoder",
    "MDNNonAttentiveDecoder",
    "BiLSTMNonAttentiveDecoder",
    "BiLSTMMDNNonAttentiveDecoder",
    "ResF0NonAttentiveDecoder",
    "MDNResF0NonAttentiveDecoder",
    "BiLSTMResF0NonAttentiveDecoder",
    # Multi-stream models
    "MultistreamSeparateF0ParametricModel",
    "NPSSMDNMultistreamParametricModel",
    "NPSSMultistreamParametricModel",
    "MultistreamSeparateF0MelModel",
    "MDNMultistreamSeparateF0MelModel",
]
class ResF0Conv1dResnet(BaseModel):
    """Conv1d + Resnet + Residual F0 prediction

    Residual F0 prediction is inspired by :cite:t:`hono2021sinsy`.

    Args:
        in_dim (int): input dimension
        hidden_dim (int): hidden dimension
        out_dim (int): output dimension
        num_layers (int): number of layers
        in_lf0_idx (int): index of lf0 in input features
        in_lf0_min (float): minimum value of lf0 in the training data of input features
        in_lf0_max (float): maximum value of lf0 in the training data of input features
        out_lf0_idx (int): index of lf0 in output features. Typically 180.
        out_lf0_mean (float): mean of lf0 in the training data of output features
        out_lf0_scale (float): scale of lf0 in the training data of output features
        init_type (str): initialization type
        use_mdn (bool): whether to use MDN or not
        num_gaussians (int): number of gaussians in MDN
        dim_wise (bool): whether to use dimension-wise MDN or not
    """

    def __init__(
        self,
        in_dim,
        hidden_dim,
        out_dim,
        num_layers=4,
        # NOTE: you must carefully set the following parameters
        in_lf0_idx=300,
        in_lf0_min=5.3936276,
        in_lf0_max=6.491111,
        out_lf0_idx=180,
        out_lf0_mean=5.953093881972361,
        out_lf0_scale=0.23435173188961034,
        init_type="none",
        use_mdn=False,
        num_gaussians=8,
        dim_wise=False,
    ):
        super().__init__()
        self.in_lf0_idx = in_lf0_idx
        self.in_lf0_min = in_lf0_min
        self.in_lf0_max = in_lf0_max
        self.out_lf0_idx = out_lf0_idx
        self.out_lf0_mean = out_lf0_mean
        self.out_lf0_scale = out_lf0_scale
        self.use_mdn = use_mdn

        model = [
            nn.ReflectionPad1d(3),
            WNConv1d(in_dim, hidden_dim, kernel_size=7, padding=0),
        ]
        for n in range(num_layers):
            # Exponentially growing dilation enlarges the receptive field
            model.append(ResnetBlock(hidden_dim, dilation=2 ** n))

        # With an MDN head, the conv stack emits hidden features and the MDN
        # layer maps them to mixture parameters; otherwise the conv stack
        # produces the output features directly.
        last_conv_out_dim = hidden_dim if use_mdn else out_dim
        model += [
            nn.LeakyReLU(0.2),
            nn.ReflectionPad1d(3),
            WNConv1d(hidden_dim, last_conv_out_dim, kernel_size=7, padding=0),
        ]
        self.model = nn.Sequential(*model)

        if self.use_mdn:
            self.mdn_layer = MDNLayer(
                in_dim=hidden_dim,
                out_dim=out_dim,
                num_gaussians=num_gaussians,
                dim_wise=dim_wise,
            )
        else:
            self.mdn_layer = None

        init_weights(self, init_type)

    def prediction_type(self):
        """Return the prediction type (probabilistic iff MDN is enabled)."""
        return (
            PredictionType.PROBABILISTIC
            if self.use_mdn
            else PredictionType.DETERMINISTIC
        )

    def has_residual_lf0_prediction(self):
        """This model predicts log-F0 as a residual from the input lf0."""
        return True

    def forward(self, x, lengths=None, y=None):
        """Forward step

        Args:
            x (torch.Tensor): input features
            lengths (torch.Tensor): lengths of input features
            y (torch.Tensor): output features (unused; kept for API compatibility)

        Returns:
            tuple: ((log_pi, log_sigma, mu), residual log-F0) if use_mdn,
            (output features, residual log-F0) otherwise
        """
        # Conv layers operate on (B, C, T); features come in as (B, T, C)
        out = self.model(x.transpose(1, 2)).transpose(1, 2)

        if self.use_mdn:
            log_pi, log_sigma, mu = self.mdn_layer(out)
        else:
            mu = out

        lf0_pred, lf0_residual = predict_lf0_with_residual(
            x,
            mu,
            self.in_lf0_idx,
            self.in_lf0_min,
            self.in_lf0_max,
            self.out_lf0_idx,
            self.out_lf0_mean,
            self.out_lf0_scale,
        )

        # Inject the predicted lf0 into the output features
        # (in the MDN case lf0_pred broadcasts over the mixture dimension)
        if self.use_mdn:
            mu[:, :, :, self.out_lf0_idx] = lf0_pred
        else:
            mu[:, :, self.out_lf0_idx] = lf0_pred.squeeze(-1)

        if self.use_mdn:
            return (log_pi, log_sigma, mu), lf0_residual
        else:
            return mu, lf0_residual

    def inference(self, x, lengths=None):
        """Inference step

        Args:
            x (torch.Tensor): input features
            lengths (torch.Tensor): lengths of input features

        Returns:
            tuple: (mu, sigma) of the most probable mixture component if
            use_mdn, output features otherwise
        """
        if self.use_mdn:
            (log_pi, log_sigma, mu), _ = self(x, lengths)
            sigma, mu = mdn_get_most_probable_sigma_and_mu(log_pi, log_sigma, mu)
            return mu, sigma
        else:
            return self(x, lengths)[0]
# Deprecated alias kept for backward compatibility; will be removed in v0.1.0.
# Equivalent to ResF0Conv1dResnet(..., use_mdn=True).
ResF0Conv1dResnetMDN = partial(ResF0Conv1dResnet, use_mdn=True)
class ResF0VariancePredictor(VariancePredictor):
    """Variance predictor in :cite:t:`ren2020fastspeech` with residual F0 prediction

    Args:
        in_dim (int): the input dimension
        out_dim (int): the output dimension
        num_layers (int): the number of layers
        hidden_dim (int): the hidden dimension
        kernel_size (int): the kernel size
        dropout (float): the dropout rate
        in_lf0_idx (int): the index of the input LF0
        in_lf0_min (float): the minimum value of the input LF0
        in_lf0_max (float): the maximum value of the input LF0
        out_lf0_idx (int): the index of the output LF0
        out_lf0_mean (float): the mean value of the output LF0
        out_lf0_scale (float): the scale value of the output LF0
        init_type (str): the initialization type
        use_mdn (bool): whether to use MDN or not
        num_gaussians (int): the number of gaussians
        dim_wise (bool): whether to use dim-wise or not
    """

    def __init__(
        self,
        in_dim,
        out_dim,
        num_layers=5,
        hidden_dim=256,
        kernel_size=5,
        dropout=0.5,
        init_type="none",
        use_mdn=False,
        num_gaussians=1,
        dim_wise=False,
        # NOTE: you must carefully set the following parameters
        in_lf0_idx=300,
        in_lf0_min=5.3936276,
        in_lf0_max=6.491111,
        out_lf0_idx=180,
        out_lf0_mean=5.953093881972361,
        out_lf0_scale=0.23435173188961034,
    ):
        super().__init__(
            in_dim=in_dim,
            out_dim=out_dim,
            num_layers=num_layers,
            hidden_dim=hidden_dim,
            kernel_size=kernel_size,
            dropout=dropout,
            init_type=init_type,
            use_mdn=use_mdn,
            num_gaussians=num_gaussians,
            dim_wise=dim_wise,
        )
        self.in_lf0_idx = in_lf0_idx
        self.in_lf0_min = in_lf0_min
        self.in_lf0_max = in_lf0_max
        self.out_lf0_idx = out_lf0_idx
        self.out_lf0_mean = out_lf0_mean
        self.out_lf0_scale = out_lf0_scale

    def has_residual_lf0_prediction(self):
        """This model predicts log-F0 as a residual from the input lf0."""
        return True

    def forward(self, x, lengths=None, y=None):
        """Forward step

        Args:
            x (torch.Tensor): input features
            lengths (torch.Tensor): lengths of input features
            y (torch.Tensor): output features

        Returns:
            tuple: ((log_pi, log_sigma, mu), residual log-F0) if use_mdn,
            (output features, residual log-F0) otherwise
        """
        # Base VariancePredictor does the actual regression; we post-process
        # its output to replace the lf0 stream with a residual prediction.
        out = super().forward(x, lengths, y)
        if self.use_mdn:
            log_pi, log_sigma, mu = out
        else:
            mu = out

        lf0_pred, lf0_residual = predict_lf0_with_residual(
            x,
            mu,
            self.in_lf0_idx,
            self.in_lf0_min,
            self.in_lf0_max,
            self.out_lf0_idx,
            self.out_lf0_mean,
            self.out_lf0_scale,
        )

        # Inject the predicted lf0 into the output features
        # (in the MDN case lf0_pred broadcasts over the mixture dimension)
        if self.use_mdn:
            mu[:, :, :, self.out_lf0_idx] = lf0_pred
        else:
            mu[:, :, self.out_lf0_idx] = lf0_pred.squeeze(-1)

        if self.use_mdn:
            return (log_pi, log_sigma, mu), lf0_residual
        else:
            return mu, lf0_residual

    def inference(self, x, lengths=None):
        """Inference step

        Args:
            x (torch.Tensor): input features
            lengths (torch.Tensor): lengths of input features

        Returns:
            tuple: (mu, sigma) of the most probable mixture component if
            use_mdn, output features otherwise
        """
        if self.use_mdn:
            (log_pi, log_sigma, mu), _ = self(x, lengths)
            sigma, mu = mdn_get_most_probable_sigma_and_mu(log_pi, log_sigma, mu)
            return mu, sigma
        else:
            return self(x, lengths)[0]
class ResF0TransformerEncoder(BaseModel):
    """Transformer encoder with residual log-F0 prediction.

    Wraps :class:`nnsvs.model.TransformerEncoder` and replaces the lf0
    stream of its output with a residual F0 prediction computed from the
    input lf0 features.
    """

    def __init__(
        self,
        in_dim,
        out_dim,
        hidden_dim,
        attention_dim,
        num_heads=2,
        num_layers=2,
        kernel_size=3,
        dropout=0.1,
        reduction_factor=1,
        init_type="none",
        downsample_by_conv=False,
        # NOTE: you must carefully set the following parameters
        in_lf0_idx=300,
        in_lf0_min=5.3936276,
        in_lf0_max=6.491111,
        out_lf0_idx=180,
        out_lf0_mean=5.953093881972361,
        out_lf0_scale=0.23435173188961034,
    ):
        super().__init__()
        # Statistics/indices used to de-normalize the input lf0 and
        # normalize the predicted lf0 stream.
        self.in_lf0_idx = in_lf0_idx
        self.in_lf0_min = in_lf0_min
        self.in_lf0_max = in_lf0_max
        self.out_lf0_idx = out_lf0_idx
        self.out_lf0_mean = out_lf0_mean
        self.out_lf0_scale = out_lf0_scale
        self.reduction_factor = reduction_factor

        self.encoder = TransformerEncoder(
            in_dim=in_dim,
            out_dim=out_dim,
            hidden_dim=hidden_dim,
            attention_dim=attention_dim,
            num_heads=num_heads,
            num_layers=num_layers,
            kernel_size=kernel_size,
            dropout=dropout,
            reduction_factor=reduction_factor,
            init_type=init_type,
            downsample_by_conv=downsample_by_conv,
        )

    def has_residual_lf0_prediction(self):
        """This model predicts log-F0 as a residual from the input lf0."""
        return True

    def forward(self, x, lengths=None, y=None):
        """Forward pass

        Args:
            x (torch.Tensor): input tensor
            lengths (torch.Tensor): input sequence lengths
            y (torch.Tensor): target tensor (unused; kept for API compatibility)

        Returns:
            tuple: (output features, residual log-F0)
        """
        outs = self.encoder(x, lengths)
        lf0_pred, lf0_residual = predict_lf0_with_residual(
            x,
            outs,
            self.in_lf0_idx,
            self.in_lf0_min,
            self.in_lf0_max,
            self.out_lf0_idx,
            self.out_lf0_mean,
            self.out_lf0_scale,
        )
        # Overwrite the encoder's lf0 stream with the residual prediction
        outs[:, :, self.out_lf0_idx] = lf0_pred.squeeze(-1)
        return outs, lf0_residual

    def inference(self, x, lengths):
        """Inference step: return output features only."""
        outs, _ = self(x, lengths)
        return outs
def LSTMEncoder(*args, **kwargs):
    """Factory alias for :class:`nnsvs.model.LSTMEncoder`.

    The import is deferred to call time (and aliased so it does not shadow
    this function's name), keeping this module importable without eagerly
    pulling in ``nnsvs.model``.
    """
    from nnsvs.model import LSTMEncoder as _LSTMEncoder

    return _LSTMEncoder(*args, **kwargs)