Source code for nnsvs.io.hts

import re
from copy import deepcopy

import numpy as np
from nnmnkwii.io import hts

_flag_re = re.compile(r"\^([A-Za-z0-9]+)\_")


def full_to_mono(labels):
    """Convert full-context labels to mono labels

    Args:
        labels (nnmnkwii.io.hts.HTSLabelFile): HTS labels

    Returns:
        nnmnkwii.io.hts.HTSLabelFile: Mono HTS labels
    """
    is_full_context = "@" in labels.contexts[0]
    if not is_full_context:
        return labels

    mono_labels = deepcopy(labels)
    mono_labels.contexts = [c.split("-")[1].split("+")[0] for c in labels.contexts]

    return mono_labels


[docs] def get_note_frame_indices(binary_dict, numeric_dict, in_feats): """Get note frame indices from frame-level input features Note that the F0 in the input features must be discrete F0. Args: binary_dict (dict): Dictionary of binary features numeric_dict (dict): Dictionary of numeric features in_feats (np.ndarray): Input features Returns: np.ndarray: Note frame indices """ pitch_idx = get_pitch_index(binary_dict, numeric_dict) score_f0 = in_feats[:, pitch_idx] note_frame_indices = np.where(score_f0 > 0)[0] return note_frame_indices
[docs] def get_pitch_index(binary_dict, numeric_dict): """Get pitch index from binary and numeric feature dictionaries Args: binary_dict (dict): Dictionary of binary features numeric_dict (dict): Dictionary of numeric features Returns: int: Pitch index """ idx = 0 pitch_idx = len(binary_dict) while idx < len(numeric_dict): if numeric_dict[idx][1].pattern.startswith("/E"): pitch_idx = pitch_idx + idx break idx += 1 return pitch_idx
[docs] def get_pitch_indices(binary_dict, numeric_dict): """Get pitch indices from binary and numeric feature dictionaries Args: binary_dict (dict): Dictionary of binary features numeric_dict (dict): Dictionary of numeric features Returns: list: Pitch indices """ idx = 0 pitch_idx = len(binary_dict) assert np.any( [numeric_dict[idx][1].pattern.startswith(p) for p in ["/D", "/E", "/F"]] ) pitch_indices = [pitch_idx] while True: idx += 1 if np.any( [numeric_dict[idx][1].pattern.startswith(p) for p in ["/D", "/E", "/F"]] ): pitch_indices.append(pitch_idx + idx) else: break return pitch_indices
[docs] def get_note_indices(labels): """Get note start indices from HTS labels Args: labels (nnmnkwii.io.hts.HTSLabelFile): HTS labels Returns: list: Note start indices """ note_indices = [0] last_start_time = labels.start_times[0] for idx in range(1, len(labels)): if labels.start_times[idx] != last_start_time: note_indices.append(idx) last_start_time = labels.start_times[idx] else: pass return note_indices
def merge_sil(labels): N = len(labels) f = hts.HTSLabelFile() f.append(labels[0], strict=False) is_full_context = "@" in labels[0][-1] for i in range(1, N): if (is_full_context and "-sil" in f[-1][-1] and "-sil" in labels[i][-1]) or ( not is_full_context and f[-1][-1] == "sil" and labels[i][-1] == "sil" ): # extend sil f.end_times[-1] = labels[i][1] else: f.append(labels[i], strict=False) return f def _is_br(label): is_full_context = "@" in label if is_full_context: is_br = "-br" in label else: is_br = label == "br" return is_br def _is_silence(label): is_full_context = "@" in label if is_full_context: is_silence = "-sil" in label or "-pau" in label else: is_silence = label == "sil" or label == "pau" return is_silence def compute_nosil_duration(labels, threshold=5.0): is_full_context = "@" in labels[0][-1] sum_d = 0 for s, e, label in labels: d = (e - s) * 1e-7 if is_full_context: is_silence = "-sil" in label or "-pau" in label else: is_silence = label == "sil" or label == "pau" if is_silence and d > threshold: pass else: sum_d += d return sum_d def segment_labels( labels, strict=True, silence_threshold=0.1, min_duration=5.0, force_split_threshold=5.0, ): """Segment labels based on sil/pau Example: [a b c sil d e f pau g h i sil j k l] -> [a b c] [d e f] [g h i] [j k l] """ seg = hts.HTSLabelFile() start_indices = [] end_indices = [] si = 0 done_last_label = False for idx, (s, e, label) in enumerate(labels): d = (e - s) * 1e-7 is_silence = _is_silence(label) if len(seg) > 0: # Compute duration except for long silences seg_d = compute_nosil_duration(seg) else: seg_d = 0 # let's try to split if (is_silence and d > force_split_threshold) or ( is_silence and d > silence_threshold and seg_d > min_duration ): if idx == len(labels) - 1: pass elif len(seg) > 0: start_indices.append(si) if is_silence and d > force_split_threshold: end_indices.append(idx - 1) start_indices.append(idx) end_indices.append(idx) seg = hts.HTSLabelFile() else: seg.append((s, e, label), strict) end_indices.append(idx) seg = hts.HTSLabelFile() si = idx + 1 elif len(seg) == 0: seg.append((s, e, label), strict) start_indices.append(si) end_indices.append(idx) seg = hts.HTSLabelFile() else: if len(seg) == 0: si = idx if idx == len(labels) - 1: done_last_label = True seg.append((s, e, label), strict) if len(seg) > 0: seg_d = compute_nosil_duration(seg) # If the last segment is short, combine with the previous segment. if seg_d < min_duration and len(end_indices) > 1: end_indices[-1] = si + len(seg) - 1 else: start_indices.append(si) end_indices.append(si + len(seg) - 1) # Handle last label if not done_last_label: s, e, label = labels[-1] d = (e - s) * 1e-7 if _is_silence(label) and d > silence_threshold: start_indices.append(end_indices[-1]) end_indices.append(end_indices[-1]) segments = [] for s, e in zip(start_indices, end_indices): seg = labels[s : e + 1] offset = seg.start_times[0] seg.start_times = np.asarray(seg.start_times) - offset seg.end_times = np.asarray(seg.end_times) - offset segments.append(seg) return segments def _label2phrases_neutrino(labels): """Segment HTS labels to phrases (NETRINO compatible) Args: labels (nnmnkwii.io.hts.HTSLabelFile): HTS labels Returns: list: phrases (i.e., list of HTS labels) """ start_indices = [] end_indices = [] started = True start_indices.append(0) is_sil_phrase = _is_silence(labels.contexts[0]) for idx, (_, _, label) in enumerate(labels): # sil or pau shouldn't be placed right before the br if idx > 0 and _is_br(label): assert not _is_silence(labels.contexts[idx - 1]) # we are in the same phrase group if started: if is_sil_phrase: if _is_silence(label): continue else: if ( not _is_silence(label) and (idx > 0 and not _is_br(labels.contexts[idx - 1])) or (idx == 0 and not _is_silence(label)) ): continue # reached the end of phrase end_indices.append(idx) started = False # start new phrase if not started: started = True is_sil_phrase = _is_silence(label) start_indices.append(idx) # handle last phrase if len(end_indices) == len(start_indices) - 1: end_indices.append(len(labels)) # Make a list of HTS labels phrases = [labels[s:e] for (s, e) in zip(start_indices, end_indices)] return phrases, start_indices, end_indices def fix_label_offset_to_zero(labels): """Fix label offset to zero Args: labels (nnmnkwii.io.hts.HTSLabelFile): HTS labels Returns: nnmnkwii.io.hts.HTSLabelFile: HTS labels with fixed offset """ offset = labels.start_times[0] if offset > 0: labels.start_times = np.asarray(labels.start_times) - offset labels.end_times = np.asarray(labels.end_times) - offset return labels def _label2phoneme_for_phrases(labels, s, e, note_indices=None): if s == e: r = labels.contexts[s] elif note_indices is None: r = " ".join(labels[s:e].contexts) else: rs = [] for i in range(s, e): if i not in [s, e] and i in note_indices: rs.append(",") rs.append(labels.contexts[i]) r = " ".join(rs).replace(" ,", ",") return r def label2phrases_str(labels, note_indices): """Convert labels to NEUTRINO-format phraselist Note that timing labels should be used as input to get the same output as the NEUTRINO. Args: labels (nnmnkwii.io.hts.HTSLabelFile): HTS labels note_indices (list): indices of notes. This is needed to insert ``,`` at note boundaries. Returns: str: NEUTRINO-format phraselist """ _, start_indices, end_indices = _label2phrases_neutrino(labels) out = "" for idx in range(len(end_indices)): s, e = start_indices[idx], end_indices[idx] start_time = int(labels.start_times[s] // 10000) ph = _label2phoneme_for_phrases(labels, s, e, note_indices) is_voiced_phrase = not ("sil" in ph or "pau" in ph) out += f"{idx} {start_time} {int(is_voiced_phrase)} {ph}\n" return out def label2phrases(labels, fix_offset=True): """Convert labels to phrases The definision of a phrase is the same as NEUTRINO. See https://studio-neutrino.com/332/ for details. Args: labels (nnmnkwii.io.hts.HTSLabelFile): HTS labels fix_offset (bool): If True, fix label offset to zero Returns: list: phrases (i.e., list of HTS labels) """ phrases = _label2phrases_neutrino(labels)[0] if fix_offset: phrases = [fix_label_offset_to_zero(p) for p in phrases] return phrases def overwrite_phoneme_flags_(labels, flag): """Overwrite phoneme flags Args: labels (nnmnkwii.io.hts.HTSLabelFile): HTS labels flag (str): phoneme flag to overwrite Returns: nnmnkwii.io.hts.HTSLabelFile: modified HTS labels """ contexts = labels.contexts for i in range(len(contexts)): n = len(_flag_re.findall(contexts[i])) if n == 0: print(i, contexts[i]) print("Warn: it is likely to have a wrong input format. Ignoring.") elif n != 1: print(i, contexts[i]) raise RuntimeError("More than two flags are found") contexts[i] = _flag_re.sub(f"^{flag}_", contexts[i]) labels.contexts = contexts return labels