Source code for nnsvs.svs

import json
import time
from copy import deepcopy
from pathlib import Path

import numpy as np
import torch
from hydra.utils import instantiate
from nnmnkwii.io import hts
from nnmnkwii.preprocessing.f0 import interp1d
from nnsvs.gen import (
    postprocess_acoustic,
    postprocess_duration,
    postprocess_waveform,
    predict_acoustic,
    predict_duration,
    predict_timelag,
    predict_waveform,
)
from nnsvs.io.hts import (
    full_to_mono,
    get_note_indices,
    get_pitch_index,
    get_pitch_indices,
    label2phrases,
    label2phrases_str,
    segment_labels,
)
from nnsvs.logger import getLogger
from nnsvs.usfgan import USFGANWrapper
from nnsvs.util import MinMaxScaler, StandardScaler, extract_static_scaler, load_vocoder
from omegaconf import OmegaConf


class BaseSVS(object):
    """Base class for singing voice synthesis (SVS) inference

    All SVS engines should inherit from this class. The input of the SVS
    engine uses the HTS-style full-context labels. The output should be a
    tuple of raw waveform and sampling rate.

    To allow language-independent SVS, this base class does not define the
    interface for frontend functionality such as converting musicXML/UST to
    HTS labels. The frontend processing should be done externally (e.g.,
    using pysinsy or utaupy) or can be implemented with an optional method.
    """
    def svs(self, labels, *args, **kwargs):
        """Run SVS inference and return the synthesized waveform

        Args:
            labels (nnmnkwii.io.hts.HTSLabelFile): HTS labels

        Returns:
            tuple: (waveform, sampling rate)
        """
        pass
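For illustration only, here is a minimal sketch of a custom engine honoring this contract. ``SineSVS`` is a hypothetical toy class, not part of nnsvs; it merely returns a constant-pitch sine wave whose length matches the total duration of the input labels.

# Toy example (hypothetical, not part of nnsvs): a BaseSVS-compatible engine
class SineSVS(BaseSVS):
    def __init__(self, sample_rate=48000):
        self.sample_rate = sample_rate

    def svs(self, labels, *args, **kwargs):
        # HTS label times are in units of 100 ns; convert to seconds
        duration_sec = labels.end_times[-1] * 1e-7
        t = np.arange(int(duration_sec * self.sample_rate)) / self.sample_rate
        # A 440 Hz sine wave as a stand-in for real synthesis
        return 0.1 * np.sin(2 * np.pi * 440.0 * t), self.sample_rate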
class SPSVS(BaseSVS):
    """Statistical parametric singing voice synthesis (SPSVS)

    Use the ``svs`` method for the simplest inference, or use the separated
    methods (e.g., ``predict_acoustic`` and ``predict_waveform``) to control
    each component of the parametric SVS system.

    Args:
        model_dir (str): directory of the model
        device (str): cpu or cuda
        verbose (int): verbosity level

    Examples:
        Synthesize waveform from a musicXML file

        .. plot::

            import numpy as np
            import librosa.display
            import pysinsy
            from nnmnkwii.io import hts
            from nnsvs.pretrained import retrieve_pretrained_model
            from nnsvs.svs import SPSVS
            from nnsvs.util import example_xml_file
            import matplotlib.pyplot as plt

            # Instantiate the SVS engine
            model_dir = retrieve_pretrained_model("r9y9/yoko_latest")
            engine = SPSVS(model_dir)

            # Extract HTS labels from a MusicXML file
            contexts = pysinsy.extract_fullcontext(example_xml_file(key="get_over"))
            labels = hts.HTSLabelFile.create_from_contexts(contexts)

            # Run inference
            wav, sr = engine.svs(labels)

            # Plot the result
            fig, ax = plt.subplots(figsize=(8, 2))
            librosa.display.waveshow(wav.astype(np.float32), sr=sr, ax=ax)

        With WORLD vocoder:

        >>> wav, sr = engine.svs(labels, vocoder_type="world")

        With a uSFGAN or SiFiGAN vocoder:

        >>> wav, sr = engine.svs(labels, vocoder_type="usfgan")
    """

    def __init__(self, model_dir, device="cpu", verbose=0):
        self.device = device
        # NOTE: assuming that the logger is instantiated without hydra,
        # we need to add a stream handler to the logger explicitly
        self.logger = getLogger(verbose=verbose, add_stream_handler=True)

        if isinstance(model_dir, str):
            model_dir = Path(model_dir)

        # Search for config.yaml
        assert (model_dir / "config.yaml").exists()
        self.config = OmegaConf.load(model_dir / "config.yaml")
        self.feature_type = self.config.get("feature_type", "world")
        self.sample_rate = self.config.get("sample_rate", 48000)

        # Question set (qst)
        self.binary_dict, self.numeric_dict = hts.load_question_set(
            model_dir / "qst.hed"
        )
        self.pitch_idx = get_pitch_index(self.binary_dict, self.numeric_dict)
        self.pitch_indices = get_pitch_indices(self.binary_dict, self.numeric_dict)

        # Time-lag model
        self.timelag_config = OmegaConf.load(model_dir / "timelag_model.yaml")
        self.timelag_model = instantiate(self.timelag_config.netG).to(device)
        checkpoint = torch.load(
            model_dir / "timelag_model.pth",
            map_location=device,
        )
        self.timelag_model.load_state_dict(checkpoint["state_dict"])

        self.timelag_in_scaler = MinMaxScaler(
            np.load(model_dir / "in_timelag_scaler_min.npy"),
            np.load(model_dir / "in_timelag_scaler_scale.npy"),
        )
        self.timelag_out_scaler = StandardScaler(
            np.load(model_dir / "out_timelag_scaler_mean.npy"),
            np.load(model_dir / "out_timelag_scaler_var.npy"),
            np.load(model_dir / "out_timelag_scaler_scale.npy"),
        )
        self.timelag_model.eval()

        # Duration model
        self.duration_config = OmegaConf.load(model_dir / "duration_model.yaml")
        self.duration_model = instantiate(self.duration_config.netG).to(device)
        checkpoint = torch.load(
            model_dir / "duration_model.pth",
            map_location=device,
        )
        self.duration_model.load_state_dict(checkpoint["state_dict"])

        self.duration_in_scaler = MinMaxScaler(
            np.load(model_dir / "in_duration_scaler_min.npy"),
            np.load(model_dir / "in_duration_scaler_scale.npy"),
        )
        self.duration_out_scaler = StandardScaler(
            np.load(model_dir / "out_duration_scaler_mean.npy"),
            np.load(model_dir / "out_duration_scaler_var.npy"),
            np.load(model_dir / "out_duration_scaler_scale.npy"),
        )
        self.duration_model.eval()

        # Acoustic model
        self.acoustic_config = OmegaConf.load(model_dir / "acoustic_model.yaml")
        self.acoustic_model = instantiate(self.acoustic_config.netG).to(device)
        checkpoint = torch.load(
            model_dir / "acoustic_model.pth",
            map_location=device,
        )
        self.acoustic_model.load_state_dict(checkpoint["state_dict"])

        self.acoustic_in_scaler = MinMaxScaler(
            np.load(model_dir / "in_acoustic_scaler_min.npy"),
            np.load(model_dir / "in_acoustic_scaler_scale.npy"),
        )
        self.acoustic_out_scaler = StandardScaler(
            np.load(model_dir / "out_acoustic_scaler_mean.npy"),
            np.load(model_dir / "out_acoustic_scaler_var.npy"),
            np.load(model_dir / "out_acoustic_scaler_scale.npy"),
        )
        # NOTE: this is used for GV post-filtering
        self.acoustic_out_static_scaler = extract_static_scaler(
            self.acoustic_out_scaler, self.acoustic_config
        )

        # (Optional) lf0 model
        if (model_dir / "lf0_model.pth").exists():
            assert hasattr(self.acoustic_model, "lf0_model")
            self.logger.info("Loading an external lf0 model.")
            checkpoint = torch.load(
                model_dir / "lf0_model.pth",
                map_location=device,
            )
            self.acoustic_model.lf0_model.load_state_dict(checkpoint["state_dict"])
        self.acoustic_model.eval()

        # Post-filter
        if (model_dir / "postfilter_model.yaml").exists():
            self.postfilter_config = OmegaConf.load(model_dir / "postfilter_model.yaml")
            self.postfilter_model = instantiate(self.postfilter_config.netG).to(device)
            checkpoint = torch.load(
                model_dir / "postfilter_model.pth",
                map_location=device,
            )
            self.postfilter_model.load_state_dict(checkpoint["state_dict"])
            self.postfilter_model.eval()
            self.postfilter_out_scaler = StandardScaler(
                np.load(model_dir / "out_postfilter_scaler_mean.npy"),
                np.load(model_dir / "out_postfilter_scaler_var.npy"),
                np.load(model_dir / "out_postfilter_scaler_scale.npy"),
            )
        else:
            self.postfilter_model = None
            self.postfilter_config = None
            self.postfilter_out_scaler = None

        # Vocoder model
        if (model_dir / "vocoder_model.pth").exists():
            self.vocoder, self.vocoder_in_scaler, self.vocoder_config = load_vocoder(
                model_dir / "vocoder_model.pth", device, self.acoustic_config
            )
        else:
            self.logger.info(
                "No trained vocoder model found. WORLD vocoder will be used."
            )
            self.vocoder = None
            self.vocoder_config = None
            self.vocoder_in_scaler = None

    def __repr__(self):
        timelag_str = json.dumps(
            OmegaConf.to_container(self.timelag_config.netG),
            sort_keys=False,
            indent=4,
        )
        duration_str = json.dumps(
            OmegaConf.to_container(self.duration_config.netG),
            sort_keys=False,
            indent=4,
        )
        acoustic_str = json.dumps(
            OmegaConf.to_container(self.acoustic_config.netG),
            sort_keys=False,
            indent=4,
        )

        repr = f"""Statistical parametric SVS (sampling rate: {self.sample_rate})
Time-lag model: {timelag_str}
Duration model: {duration_str}
Acoustic model: {acoustic_str}
"""
        if self.postfilter_model is not None:
            postfilter_str = json.dumps(
                OmegaConf.to_container(self.postfilter_config.netG),
                sort_keys=False,
                indent=4,
            )
            repr += f"Post-filter model: {postfilter_str}\n"
        else:
            repr += "Post-filter model: None\n"

        if self.vocoder is not None:
            if (
                "generator" in self.vocoder_config
                and "discriminator" in self.vocoder_config
            ):
                # usfgan
                vocoder_params = OmegaConf.to_container(
                    self.vocoder_config["generator"], throw_on_missing=True
                )
            else:
                vocoder_params = {
                    "generator_type": self.vocoder_config.get(
                        "generator_type", "ParallelWaveGANGenerator"  # type: ignore
                    ),
                    "generator_params": OmegaConf.to_container(
                        self.vocoder_config.generator_params
                    ),
                }
            vocoder_str = json.dumps(
                vocoder_params,
                sort_keys=False,
                indent=4,
            )
            repr += f"Vocoder model: {vocoder_str}\n"
        else:
            repr += "Vocoder model: WORLD\n"

        return repr
    def set_device(self, device):
        """Set device for the SVS model

        Args:
            device (str): cpu or cuda.
        """
        self.logger.info(f"Set device to {device}")
        self.device = device
        self.timelag_model.to(device)
        self.duration_model.to(device)
        self.acoustic_model.to(device)
        if self.postfilter_model is not None:
            self.postfilter_model.to(device)
        if self.vocoder is not None:
            self.vocoder.to(device)
    def predict_timelag(self, labels):
        """Predict time-lag from HTS labels

        Args:
            labels (nnmnkwii.io.hts.HTSLabelFile): HTS labels.

        Returns:
            ndarray: Predicted time-lag.
        """
        start_time = time.time()
        lag = predict_timelag(
            self.device,
            labels,
            timelag_model=self.timelag_model,
            timelag_config=self.timelag_config,
            timelag_in_scaler=self.timelag_in_scaler,
            timelag_out_scaler=self.timelag_out_scaler,
            binary_dict=self.binary_dict,
            numeric_dict=self.numeric_dict,
            pitch_indices=self.pitch_indices,
            log_f0_conditioning=self.config.log_f0_conditioning,
            allowed_range=self.config.timelag.allowed_range,
            allowed_range_rest=self.config.timelag.allowed_range_rest,
            force_clip_input_features=self.config.timelag.force_clip_input_features,
            frame_period=self.config.frame_period,
        )
        self.logger.info(
            f"Elapsed time for time-lag prediction: {time.time() - start_time:.3f} sec"
        )
        return lag
    def predict_duration(self, labels):
        """Predict durations from HTS labels

        Args:
            labels (nnmnkwii.io.hts.HTSLabelFile): HTS labels.

        Returns:
            ndarray: Predicted durations.
        """
        start_time = time.time()
        durations = predict_duration(
            self.device,
            labels,
            duration_model=self.duration_model,
            duration_config=self.duration_config,
            duration_in_scaler=self.duration_in_scaler,
            duration_out_scaler=self.duration_out_scaler,
            binary_dict=self.binary_dict,
            numeric_dict=self.numeric_dict,
            pitch_indices=self.pitch_indices,
            log_f0_conditioning=self.config.log_f0_conditioning,
            force_clip_input_features=self.config.duration.force_clip_input_features,
            frame_period=self.config.frame_period,
        )
        self.logger.info(
            f"Elapsed time for duration prediction: {time.time() - start_time:.3f} sec"
        )
        return durations
    def postprocess_duration(self, labels, pred_durations, lag):
        """Post-process durations

        Args:
            labels (nnmnkwii.io.hts.HTSLabelFile): HTS labels.
            pred_durations (ndarray): Predicted durations.
            lag (ndarray): Predicted time-lag.

        Returns:
            nnmnkwii.io.hts.HTSLabelFile: duration modified HTS labels.
        """
        start_time = time.time()
        duration_modified_labels = postprocess_duration(
            labels, pred_durations, lag, frame_period=self.config.frame_period
        )
        self.logger.info(
            f"Elapsed time for duration post-processing: {time.time() - start_time:.3f} sec"
        )
        return duration_modified_labels
    def predict_timing(self, labels):
        """Predict timing from HTS labels

        Args:
            labels (nnmnkwii.io.hts.HTSLabelFile): HTS labels.

        Returns:
            nnmnkwii.io.hts.HTSLabelFile: duration modified HTS labels.
        """
        lag = self.predict_timelag(labels)
        durations = self.predict_duration(labels)
        duration_modified_full_labels = self.postprocess_duration(
            labels, durations, lag
        )
        return duration_modified_full_labels
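For reference, ``predict_timing`` is just the composition of the three methods above; running them separately is useful when you want to inspect or edit the intermediate timing outputs. A usage sketch, assuming ``engine`` is an SPSVS instance and ``labels`` are HTS full-context labels as in the class docstring example:

# Step-by-step equivalent of engine.predict_timing(labels)
lag = engine.predict_timelag(labels)         # note onset time-lags
durations = engine.predict_duration(labels)  # phone-level durations
duration_modified_labels = engine.postprocess_duration(labels, durations, lag)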
    def predict_acoustic(self, duration_modified_labels, f0_shift_in_cent=0):
        """Predict acoustic features from HTS labels

        Args:
            duration_modified_labels (nnmnkwii.io.hts.HTSLabelFile): HTS labels.
            f0_shift_in_cent (float): F0 shift in cent.

        Returns:
            ndarray: Predicted acoustic features.
        """
        start_time = time.time()
        acoustic_features = predict_acoustic(
            device=self.device,
            labels=duration_modified_labels,
            acoustic_model=self.acoustic_model,
            acoustic_config=self.acoustic_config,
            acoustic_in_scaler=self.acoustic_in_scaler,
            acoustic_out_scaler=self.acoustic_out_scaler,
            binary_dict=self.binary_dict,
            numeric_dict=self.numeric_dict,
            subphone_features=self.acoustic_config.get(
                "subphone_features", "coarse_coding"
            ),
            pitch_indices=self.pitch_indices,
            log_f0_conditioning=self.config.log_f0_conditioning,
            force_clip_input_features=self.acoustic_config.get(
                "force_clip_input_features", True
            ),
            frame_period=self.config.frame_period,
            f0_shift_in_cent=f0_shift_in_cent,
        )
        self.logger.info(
            f"Elapsed time for acoustic feature prediction: {time.time() - start_time:.3f} sec"
        )
        # Log the real-time factor (RT)
        RT = (time.time() - start_time) / (
            acoustic_features.shape[0] * self.config.frame_period / 1000
        )
        self.logger.info(f"Real-time factor for acoustic feature prediction: {RT:.3f}")
        return acoustic_features
    def postprocess_acoustic(
        self,
        duration_modified_labels,
        acoustic_features,
        post_filter_type="gv",
        trajectory_smoothing=True,
        trajectory_smoothing_cutoff=50,
        trajectory_smoothing_cutoff_f0=20,
        vuv_threshold=0.5,
        force_fix_vuv=False,
        fill_silence_to_rest=False,
        f0_shift_in_cent=0,
    ):
        """Post-process acoustic features

        The function converts acoustic features in a single ndarray to a tuple
        of multi-stream acoustic features, e.g., array -> (mgc, lf0, vuv, bap).

        If post_filter_type=``nnsvs`` is specified, the learned post-filter is
        applied. However, it is recommended to use ``gv`` in general.

        Args:
            duration_modified_labels (nnmnkwii.io.hts.HTSLabelFile): HTS labels.
            acoustic_features (ndarray): Predicted acoustic features.
            post_filter_type (str): Post-filter type. One of ``gv``, ``merlin``
                or ``nnsvs``. Recommended to use ``gv`` for general purpose.
            trajectory_smoothing (bool): Whether to apply trajectory smoothing.
            trajectory_smoothing_cutoff (float): Cutoff frequency for trajectory
                smoothing of spectral features.
            trajectory_smoothing_cutoff_f0 (float): Cutoff frequency for
                trajectory smoothing of f0.
            vuv_threshold (float): V/UV threshold.
            force_fix_vuv (bool): Force fix V/UV.
            fill_silence_to_rest (bool): Fill silence to rest frames.
            f0_shift_in_cent (float): F0 shift in cent.

        Returns:
            tuple: Post-processed multi-stream acoustic features.
        """
        start_time = time.time()
        multistream_features = postprocess_acoustic(
            device=self.device,
            duration_modified_labels=duration_modified_labels,
            acoustic_features=acoustic_features,
            binary_dict=self.binary_dict,
            numeric_dict=self.numeric_dict,
            acoustic_config=self.acoustic_config,
            acoustic_out_static_scaler=self.acoustic_out_static_scaler,
            postfilter_model=self.postfilter_model,
            postfilter_config=self.postfilter_config,
            postfilter_out_scaler=self.postfilter_out_scaler,
            sample_rate=self.sample_rate,
            frame_period=self.config.frame_period,
            relative_f0=self.config.acoustic.relative_f0,
            feature_type=self.feature_type,
            post_filter_type=post_filter_type,
            trajectory_smoothing=trajectory_smoothing,
            trajectory_smoothing_cutoff=trajectory_smoothing_cutoff,
            trajectory_smoothing_cutoff_f0=trajectory_smoothing_cutoff_f0,
            vuv_threshold=vuv_threshold,
            f0_shift_in_cent=f0_shift_in_cent,
            vibrato_scale=1.0,  # only valid for Sinsy-like models
            force_fix_vuv=force_fix_vuv,
            fill_silence_to_rest=fill_silence_to_rest,
        )
        self.logger.info(
            f"Elapsed time for acoustic post-processing: {time.time() - start_time:.3f} sec"
        )
        return multistream_features
    def predict_waveform(
        self,
        multistream_features,
        vocoder_type="world",
        vuv_threshold=0.5,
    ):
        """Predict waveform from acoustic features

        Args:
            multistream_features (tuple): Multi-stream acoustic features.
            vocoder_type (str): Vocoder type. One of ``world``, ``pwg`` or
                ``usfgan``. If ``auto`` is specified, the vocoder is
                automatically selected.
            vuv_threshold (float): V/UV threshold.

        Returns:
            ndarray: Predicted waveform.
        """
        start_time = time.time()
        if vocoder_type in ["pwg", "usfgan"] and self.vocoder is None:
            raise ValueError(
                """Pre-trained vocoder model is not found.
Only the WORLD vocoder is supported for waveform generation"""
            )
        if vocoder_type == "auto":
            if self.feature_type == "melf0":
                assert self.vocoder is not None
                vocoder_type = (
                    "usfgan" if isinstance(self.vocoder, USFGANWrapper) else "pwg"
                )
            elif self.feature_type == "world":
                if self.vocoder is None:
                    vocoder_type = "world"
                else:
                    vocoder_type = (
                        "usfgan" if isinstance(self.vocoder, USFGANWrapper) else "pwg"
                    )

        wav = predict_waveform(
            device=self.device,
            multistream_features=multistream_features,
            vocoder=self.vocoder,
            vocoder_config=self.vocoder_config,
            vocoder_in_scaler=self.vocoder_in_scaler,
            sample_rate=self.sample_rate,
            frame_period=self.config.frame_period,
            use_world_codec=self.config.get("use_world_codec", False),
            feature_type=self.feature_type,
            vocoder_type=vocoder_type,
            vuv_threshold=vuv_threshold,
        )
        self.logger.info(
            f"Elapsed time for waveform generation: {time.time() - start_time:.3f} sec"
        )
        RT = (time.time() - start_time) / (len(wav) / self.sample_rate)
        self.logger.info(f"Real-time factor for waveform generation: {RT:.3f}")
        return wav
    def postprocess_waveform(
        self,
        wav,
        dtype=np.int16,
        peak_norm=False,
        loudness_norm=False,
        target_loudness=-20,
    ):
        """Post-process waveform

        Args:
            wav (ndarray): Waveform.
            dtype (dtype): Data type of waveform.
            peak_norm (bool): Whether to apply peak normalization.
            loudness_norm (bool): Whether to apply loudness normalization.
            target_loudness (float): Target loudness in dB.

        Returns:
            ndarray: Post-processed waveform.
        """
        start_time = time.time()
        wav = postprocess_waveform(
            wav=wav,
            sample_rate=self.sample_rate,
            dtype=dtype,
            peak_norm=peak_norm,
            loudness_norm=loudness_norm,
            target_loudness=target_loudness,
        )
        self.logger.info(
            f"Elapsed time for waveform post-processing: {time.time() - start_time:.3f} sec"
        )
        return wav
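Putting the separated methods together reproduces what ``svs`` does internally. A sketch of the full manual pipeline, reusing the pretrained model and score from the class docstring example:

import pysinsy
from nnmnkwii.io import hts
from nnsvs.pretrained import retrieve_pretrained_model
from nnsvs.svs import SPSVS
from nnsvs.util import example_xml_file

engine = SPSVS(retrieve_pretrained_model("r9y9/yoko_latest"))
contexts = pysinsy.extract_fullcontext(example_xml_file(key="get_over"))
labels = hts.HTSLabelFile.create_from_contexts(contexts)

# Timing -> acoustic -> vocoder -> waveform post-processing
duration_modified_labels = engine.predict_timing(labels)
acoustic_features = engine.predict_acoustic(duration_modified_labels)
multistream_features = engine.postprocess_acoustic(
    duration_modified_labels, acoustic_features, post_filter_type="gv"
)
wav = engine.predict_waveform(multistream_features, vocoder_type="auto")
wav = engine.postprocess_waveform(wav, loudness_norm=True, target_loudness=-20)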
    def svs(
        self,
        labels,
        vocoder_type="world",
        post_filter_type="gv",
        trajectory_smoothing=True,
        trajectory_smoothing_cutoff=50,
        trajectory_smoothing_cutoff_f0=20,
        vuv_threshold=0.5,
        style_shift=0,
        force_fix_vuv=False,
        fill_silence_to_rest=False,
        dtype=np.int16,
        peak_norm=False,
        loudness_norm=False,
        target_loudness=-20,
        segmented_synthesis=False,
    ):
        """Synthesize waveform from HTS labels.

        Args:
            labels (nnmnkwii.io.hts.HTSLabelFile): HTS labels
            vocoder_type (str): Vocoder type. One of ``world``, ``pwg`` or
                ``usfgan``. If ``auto`` is specified, the vocoder is
                automatically selected.
            post_filter_type (str): Post-filter type. ``merlin``, ``gv`` or
                ``nnsvs`` is supported.
            trajectory_smoothing (bool): Whether to smooth acoustic feature
                trajectory.
            trajectory_smoothing_cutoff (int): Cutoff frequency for trajectory
                smoothing.
            trajectory_smoothing_cutoff_f0 (int): Cutoff frequency for
                trajectory smoothing of f0.
            vuv_threshold (float): Threshold for V/UV.
            style_shift (int): style shift parameter
            force_fix_vuv (bool): Whether to correct V/UV.
            fill_silence_to_rest (bool): Fill silence to rest frames.
            dtype (np.dtype): Data type of the output waveform.
            peak_norm (bool): Whether to normalize the waveform by peak value.
            loudness_norm (bool): Whether to normalize the waveform by loudness.
            target_loudness (float): Target loudness in dB.
            segmented_synthesis (bool): Whether to use segmented synthesis.

        Returns:
            tuple: (waveform, sampling rate)
        """
        start_time = time.time()
        vocoder_type = vocoder_type.lower()
        if vocoder_type not in ["world", "pwg", "usfgan", "auto"]:
            raise ValueError(f"Unknown vocoder type: {vocoder_type}")
        if post_filter_type not in ["merlin", "nnsvs", "gv", "none"]:
            raise ValueError(f"Unknown post-filter type: {post_filter_type}")

        # Predict timings
        duration_modified_labels = self.predict_timing(labels)

        # NOTE: segmented synthesis is not well tested. There MUST be better ways
        # to do this.
        if segmented_synthesis:
            self.logger.warning(
                "Segmented synthesis is not well tested. Use it at your own risk."
            )
            duration_modified_labels_segs = segment_labels(
                duration_modified_labels,
                # the following parameters are based on experiments in the NNSVS
                # paper, tuned with Namine Ritsu's database
                silence_threshold=0.1,
                min_duration=5.0,
                force_split_threshold=5.0,
            )
            from tqdm.auto import tqdm
        else:
            duration_modified_labels_segs = [duration_modified_labels]

            def tqdm(x, **kwargs):
                return x

        # Run acoustic model and vocoder
        hts_frame_shift = int(self.config.frame_period * 1e4)
        wavs = []
        self.logger.info(f"Number of segments: {len(duration_modified_labels_segs)}")
        for duration_modified_labels_seg in tqdm(
            duration_modified_labels_segs,
            desc="[segment]",
            total=len(duration_modified_labels_segs),
        ):
            duration_modified_labels_seg.frame_shift = hts_frame_shift

            # Predict acoustic features
            # NOTE: if a non-zero style_shift is specified, the input pitch
            # will be shifted before running the acoustic model
            acoustic_features = self.predict_acoustic(
                duration_modified_labels_seg,
                f0_shift_in_cent=style_shift * 100,
            )

            # Post-processing for acoustic features
            # NOTE: the output pitch is shifted back as a part of post-processing
            multistream_features = self.postprocess_acoustic(
                acoustic_features=acoustic_features,
                duration_modified_labels=duration_modified_labels_seg,
                trajectory_smoothing=trajectory_smoothing,
                trajectory_smoothing_cutoff=trajectory_smoothing_cutoff,
                trajectory_smoothing_cutoff_f0=trajectory_smoothing_cutoff_f0,
                force_fix_vuv=force_fix_vuv,
                fill_silence_to_rest=fill_silence_to_rest,
                f0_shift_in_cent=-style_shift * 100,
            )

            # Generate waveform by vocoder
            wav = self.predict_waveform(
                multistream_features=multistream_features,
                vocoder_type=vocoder_type,
                vuv_threshold=vuv_threshold,
            )
            wavs.append(wav)

        # Concatenate segmented waveforms
        wav = np.concatenate(wavs, axis=0).reshape(-1)

        # Post-processing for the output waveform
        wav = self.postprocess_waveform(
            wav,
            dtype=dtype,
            peak_norm=peak_norm,
            loudness_norm=loudness_norm,
            target_loudness=target_loudness,
        )
        self.logger.info(f"Total time: {time.time() - start_time:.3f} sec")
        RT = (time.time() - start_time) / (len(wav) / self.sample_rate)
        self.logger.info(f"Total real-time factor: {RT:.3f}")

        return wav, self.sample_rate
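For the one-call interface, the same pipeline with a few of the options above looks like this (a usage sketch; ``engine`` and ``labels`` as in the class docstring example):

# One-call synthesis with explicit options
wav, sr = engine.svs(
    labels,
    vocoder_type="auto",       # use a trained neural vocoder if one is bundled
    post_filter_type="gv",     # recommended default
    segmented_synthesis=True,  # synthesize long scores segment by segment
    loudness_norm=True,
    target_loudness=-20,
)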
def _warn_if_model_is_old(logger):
    logger.warning(
        """It is likely that you have trained your model with an old version of NNSVS.
It is recommended to retrain your model with the latest version of NNSVS."""
    )
class NEUTRINO(SPSVS):
    """NEUTRINO-like interface for singing voice synthesis

    Args:
        model_dir (str): model directory
        device (str): device name
        verbose (int): verbose level
    """

    def __init__(self, model_dir, device="cpu", verbose=0):
        super().__init__(model_dir, device=device, verbose=verbose)

        if self.feature_type != "world":
            raise RuntimeError(f"Unsupported feature type: {self.feature_type}")
        if not self.config.get("use_world_codec", False):
            self.logger.warning(
                "The WORLD codec is required to output NEUTRINO-compatible features"
            )
            _warn_if_model_is_old(self.logger)
    @classmethod
    def musicxml2label(cls, input_file):
        """Convert musicXML to full and mono HTS labels

        Args:
            input_file (str): musicXML file

        Returns:
            tuple: (full HTS labels, mono HTS labels)
        """
        import pysinsy

        contexts = pysinsy.extract_fullcontext(input_file)
        full_labels = hts.HTSLabelFile.create_from_contexts(contexts)
        mono_labels = full_to_mono(full_labels)

        return full_labels, mono_labels
    def get_num_phrases(self, labels):
        """Get number of phrases

        Args:
            labels (nnmnkwii.io.hts.HTSLabelFile): HTS label

        Returns:
            int: number of phrases
        """
        phrases = label2phrases(labels)
        return len(phrases)
    def get_phraselist(self, full_labels, timing_labels):
        """Get phraselist from full and timing HTS labels

        Args:
            full_labels (nnmnkwii.io.hts.HTSLabelFile): full HTS label
            timing_labels (nnmnkwii.io.hts.HTSLabelFile): timing HTS label

        Returns:
            str: phraselist
        """
        note_indices = get_note_indices(full_labels)
        phraselist = label2phrases_str(timing_labels, note_indices)
        return phraselist
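A usage sketch of the NEUTRINO-style label and phrase helpers above; the model directory and score paths are placeholders:

# NEUTRINO-style label/phrase workflow (paths are placeholders)
engine = NEUTRINO("path/to/model_dir")
full_labels, mono_labels = NEUTRINO.musicxml2label("score.musicxml")

# Estimate timings, then derive timing (mono) labels from them
duration_modified_full_labels = engine.predict_timing(full_labels)
timing_labels = full_to_mono(duration_modified_full_labels)

num_phrases = engine.get_num_phrases(full_labels)
phraselist = engine.get_phraselist(full_labels, timing_labels)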
    def predict_acoustic(
        self,
        full_labels,
        timing_labels=None,
        style_shift=0,
        phrase_num=-1,
        trajectory_smoothing=True,
        trajectory_smoothing_cutoff=50,
        trajectory_smoothing_cutoff_f0=20,
        vuv_threshold=0.5,
        force_fix_vuv=False,
        fill_silence_to_rest=False,
    ):
        """Main inference of timing and acoustic predictions

        Args:
            full_labels (nnmnkwii.io.hts.HTSLabelFile): full HTS label
            timing_labels (nnmnkwii.io.hts.HTSLabelFile): timing HTS label
            style_shift (int): style shift parameter
            phrase_num (int): phrase number to use for inference
            trajectory_smoothing (bool): whether to apply trajectory smoothing
            trajectory_smoothing_cutoff (float): cutoff frequency for
                trajectory smoothing
            trajectory_smoothing_cutoff_f0 (float): cutoff frequency for
                trajectory smoothing of f0
            vuv_threshold (float): V/UV threshold
            force_fix_vuv (bool): whether to force fix V/UV
            fill_silence_to_rest (bool): Fill silence to rest frames.

        Returns:
            tuple: (f0, mgc, bap)
        """
        if timing_labels is None:
            self.logger.warning("'timing_labels' is not provided.")
            # Run timing prediction
            duration_modified_full_labels = self.predict_timing(full_labels)
            timing_labels = full_to_mono(duration_modified_full_labels)
        else:
            # Load pre-estimated timing
            duration_modified_full_labels = deepcopy(full_labels)
            duration_modified_full_labels.start_times = timing_labels.start_times.copy()
            duration_modified_full_labels.end_times = timing_labels.end_times.copy()

        if phrase_num >= 0:
            phrases = label2phrases(duration_modified_full_labels)
            if phrase_num >= len(phrases):
                raise RuntimeError(
                    f"phrase_num is too large: {phrase_num} > {len(phrases)}"
                )
            # Use the specified phrase for inference
            duration_modified_full_labels = phrases[phrase_num]
            self.logger.info(f"Using phrase {phrase_num}/{len(phrases)} for inference")

        # Predict acoustic features
        # NOTE: if a non-zero style_shift is specified, the input pitch
        # will be shifted before running the acoustic model
        acoustic_features = super().predict_acoustic(
            duration_modified_full_labels,
            f0_shift_in_cent=style_shift * 100,
        )

        # Post-processing for acoustic features
        # NOTE: the output pitch is shifted back as a part of post-processing
        multistream_features = super().postprocess_acoustic(
            acoustic_features=acoustic_features,
            duration_modified_labels=duration_modified_full_labels,
            trajectory_smoothing=trajectory_smoothing,
            trajectory_smoothing_cutoff=trajectory_smoothing_cutoff,
            trajectory_smoothing_cutoff_f0=trajectory_smoothing_cutoff_f0,
            vuv_threshold=vuv_threshold,
            force_fix_vuv=force_fix_vuv,
            fill_silence_to_rest=fill_silence_to_rest,
            f0_shift_in_cent=-style_shift * 100,
        )
        assert len(multistream_features) == 4
        mgc, lf0, vuv, bap = multistream_features

        if not self.config.get("use_world_codec", False):
            self.logger.warning(
                """use_world_codec is not set.
WORLD (NEUTRINO edition) does not work with the output of this model."""
            )

        # Convert lf0 to f0
        f0 = np.exp(lf0.copy())
        f0[vuv < vuv_threshold] = 0

        # NOTE: NEUTRINO-compatible MGC should have negative values at the
        # 0-th coefficient
        if mgc[:, 0].mean() > 0:
            self.logger.warning("MGC 0-th coefficient is positive.")
            _warn_if_model_is_old(self.logger)

        # Make sure to have the correct array layout and dtype.
        # These parameters can be used to generate a waveform by WORLD.
        f0 = np.ascontiguousarray(f0).astype(np.float64)
        mgc = np.ascontiguousarray(mgc).astype(np.float64)
        bap = np.ascontiguousarray(bap).astype(np.float64)

        return f0, mgc, bap
    def predict_waveform(
        self,
        f0,
        mgc,
        bap,
        vocoder_type="world",
        vuv_threshold=0.5,
        dtype=np.int16,
        peak_norm=False,
        loudness_norm=False,
        target_loudness=-20,
    ):
        """Generate waveform from acoustic features

        Args:
            f0 (ndarray): f0
            mgc (ndarray): mel-cepstrum
            bap (ndarray): band-aperiodicity
            vocoder_type (str): vocoder type
            vuv_threshold (float): V/UV threshold
            dtype (np.dtype): Data type of the output waveform.
            peak_norm (bool): Whether to normalize the waveform by peak value.
            loudness_norm (bool): Whether to normalize the waveform by loudness.
            target_loudness (float): Target loudness in dB.

        Returns:
            ndarray: waveform
        """
        # Convert NEUTRINO-like features to NNSVS's ones:
        # (f0, mgc, bap) -> (mgc, lf0, vuv, bap)
        vuv = (f0 > 0).astype(np.float64).reshape(-1, 1)
        lf0 = f0.copy()
        lf0[np.nonzero(lf0)] = np.log(f0[np.nonzero(lf0)])
        lf0 = interp1d(lf0, kind="slinear")
        multistream_features = (mgc, lf0, vuv, bap)

        wav = super().predict_waveform(
            multistream_features=multistream_features,
            vocoder_type=vocoder_type,
            vuv_threshold=vuv_threshold,
        )
        wav = self.postprocess_waveform(
            wav,
            dtype=dtype,
            peak_norm=peak_norm,
            loudness_norm=loudness_norm,
            target_loudness=target_loudness,
        )
        return wav
    def svs(self, labels):
        """Synthesize waveform from HTS labels

        Args:
            labels (nnmnkwii.io.hts.HTSLabelFile): HTS labels

        Returns:
            tuple: (waveform, sample_rate)
        """
        self.logger.warning(
            "Use the `predict_acoustic` and `predict_waveform` methods instead."
        )
        f0, mgc, bap = self.predict_acoustic(labels)
        wav = self.predict_waveform(f0, mgc, bap)
        return wav, self.sample_rate
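As the warning above suggests, the recommended NEUTRINO-style usage is the two-step form. A sketch with placeholder paths:

# Recommended two-step NEUTRINO-style inference (paths are placeholders)
engine = NEUTRINO("path/to/model_dir")
full_labels, _ = NEUTRINO.musicxml2label("score.musicxml")

f0, mgc, bap = engine.predict_acoustic(full_labels)
wav = engine.predict_waveform(f0, mgc, bap, loudness_norm=True)
sr = engine.sample_rate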