Source code for nnsvs.pitch

"""This module provides functionality for pitch analysis.


Nakano et al, "An Automatic Singing Skill Evaluation Method
for Unknown Melodies Using Pitch Interval Accuracy and Vibrato Features"
Proc. Interspeech 2006.

山田 et al, "HMM に基づく歌声合成のためのビブラートモデル化"
IPSJ SIG Tech. Report 2009.

Note that the vibrato extraction method in this module is experimental.
Because details of the vibrato extraction method are not described
in the above papers and not trivial to implement (in my opinion),
my implementation may not work well compared to the original author's one.
Also note that there are a lot of tunable parameters (threshold,
window size, min/max extent, cut-off frequency, etc.).
If you want to get maximum performance, you might want to tune these
parameters with your dataset.
I tested this code with the kiritan_singing and nit-song070 databases.
"""
import librosa
import numpy as np
import torch
from nnsvs.dsp import lowpass_filter
from scipy.signal import argrelmax, argrelmin

# Reference frequency of C4 in Hz. The exponent (3 / 12 - 1) equals -9 / 12,
# i.e. nine semitones below 440 Hz (presumably A4), which is about 261.63 Hz.
_c4_hz = 440 * 2 ** (3 / 12 - 1)
# Cent value assigned to C4 by the Hz <-> cent conversions in this module.
_c4_cent = 4800

def hz_to_cent_based_c4(hz):
    """Convert Hz to cent based on C4

    Positive entries are mapped to ``1200 * log2(hz / _c4_hz) + _c4_cent``,
    using the module-level C4 reference constants. Non-positive entries
    (e.g. unvoiced frames marked by 0) are copied through unchanged.

    Args:
        hz (np.ndarray): array of Hz (any number of dimensions). The result
            is written into a copy of ``hz``, so a floating-point dtype is
            needed to avoid truncation.

    Returns:
        np.ndarray: array of cent with the same shape and dtype as ``hz``
    """
    out = hz.copy()
    # A boolean mask selects individual elements whatever the number of
    # dimensions; ``np.where(...)[0]`` would only give first-axis indices and
    # convert whole rows (including their zeros) for multi-dimensional input.
    positive = hz > 0
    out[positive] = 1200 * np.log(hz[positive] / _c4_hz) / np.log(2) + _c4_cent
    return out
def cent_to_hz_based_c4(cent):
    """Convert cent to Hz based on C4

    Positive entries are mapped to ``2 ** ((cent - _c4_cent) / 1200) * _c4_hz``,
    using the module-level C4 reference constants. Non-positive entries
    (e.g. unvoiced frames marked by 0) are copied through unchanged.

    Args:
        cent (np.ndarray): array of cent (any number of dimensions). The
            result is written into a copy of ``cent``, so a floating-point
            dtype is needed to avoid truncation.

    Returns:
        np.ndarray: array of Hz with the same shape and dtype as ``cent``
    """
    out = cent.copy()
    # A boolean mask selects individual elements whatever the number of
    # dimensions; ``np.where(...)[0]`` would only give first-axis indices and
    # convert whole rows (including their zeros) for multi-dimensional input.
    positive = cent > 0
    out[positive] = (
        np.exp((cent[positive] - _c4_cent) * np.log(2) / 1200) * _c4_hz
    )
    return out
def nonzero_segments(f0):
    """Find runs of consecutive positive values

    Args:
        f0 (np.ndarray): 1-D array of f0 (values <= 0 are treated as gaps)

    Returns:
        list: list of (start, end) index tuples, in order of appearance.
        For a run that is followed by a non-positive frame, ``end`` is the
        index of that frame (exclusive end). For a run that reaches the end
        of the array, ``end`` is ``len(f0) - 1``, i.e. the index of the last
        frame itself.
    """
    voiced = f0 > 0
    num_frames = len(f0)
    found = []
    onset = None  # start index of the run currently open, if any
    for frame, is_voiced in enumerate(voiced):
        if onset is None:
            if is_voiced:
                onset = frame
        elif not is_voiced:
            found.append((onset, frame))
            onset = None
    # A run still open here necessarily extends to the final frame.
    if onset is not None:
        found.append((onset, num_frames - 1))
    return found
def note_segments(lf0_score_denorm):
    """Compute note segments (start and end indices) from log-F0

    Note that unvoiced frames must be set to 0 in advance.

    Every voiced run reported by ``nonzero_segments`` is split wherever two
    consecutive values differ; the remaining tail of the run is then added
    as the last note of that run.

    Args:
        lf0_score_denorm (Tensor): 1-D tensor of score log-F0. The code
            indexes it with a single frame index and uses the result as a
            scalar condition.

    Returns:
        list: list of note (start, end) indices
    """
    num_frames = len(lf0_score_denorm)
    notes = []
    for seg_start, seg_end in nonzero_segments(lf0_score_denorm):
        run = lf0_score_denorm[seg_start : seg_end + 1]
        # 1 where the value changes between neighbouring frames, 0 otherwise
        changed = torch.sign(torch.abs(torch.diff(run)))
        change_offsets = torch.where(changed > 0)[0]

        begin = seg_start
        finish = -1
        for offset in change_offsets:
            finish = int(seg_start + offset)
            notes.append((begin, finish))
            begin = finish + 1

        # Last note of this run: skip any non-positive frames, then extend
        # while the value stays positive (never past the final frame index).
        while begin < num_frames - 1 and lf0_score_denorm[begin] <= 0:
            begin += 1
        finish = begin + 1
        while finish < num_frames - 1 and lf0_score_denorm[finish] > 0:
            finish += 1

        # A tail that did not grow beyond a single frame is dropped.
        if finish != begin + 1:
            notes.append((begin, finish))

    return notes
def compute_f0_correction_ratio(
    f0,
    f0_score,
    edges_to_be_excluded=50,
    out_of_tune_threshold=200,
    correction_threshold=100,
):
    """Compute f0 correction ratio

    The ratio is the mean of ``f0_score / f0`` over the central part of every
    sufficiently long note, after discarding frames that are too far out of
    tune. It is finally clipped to ``+/- correction_threshold`` cent.

    Args:
        f0 (np.ndarray): array of f0
        f0_score (np.ndarray): array of f0 score
        edges_to_be_excluded (int): number of frames dropped at both ends of
            each note, to exclude overshoot/preparation. Notes that are not
            longer than twice this value are ignored.
        out_of_tune_threshold (float): frames whose score/f0 ratio deviates by
            this many cent or more are excluded from the average.
        correction_threshold (float): maximum allowed correction in cent.

    Returns:
        float: correction ratio to multiplied to F0 (i.e. f0 * ratio).
        ``1.0`` (no correction) is returned when no frame is usable.
    """
    segments = note_segments(torch.from_numpy(f0_score))

    center_f0s = []
    center_score_f0s = []
    for s, e in segments:
        L = e - s
        if L > edges_to_be_excluded * 2:
            center_f0s.append(f0[s + edges_to_be_excluded : e - edges_to_be_excluded])
            center_score_f0s.append(
                f0_score[s + edges_to_be_excluded : e - edges_to_be_excluded]
            )

    # np.concatenate raises ValueError on an empty list, so bail out early
    # when every note is too short to contribute.
    if not center_f0s:
        print("warn: no note is long enough to estimate pitch correction.")
        return 1.0

    center_f0s = np.concatenate(center_f0s)
    center_score_f0s = np.concatenate(center_score_f0s)

    # Compute pitch ratio to be multiplied
    nonzero_indices = (center_f0s > 0) & (center_score_f0s > 0)
    ratio = center_score_f0s[nonzero_indices] / center_f0s[nonzero_indices]

    # Exclude too out-of-tune frames (over 2 semitone with the default value)
    up_threshold = np.exp(out_of_tune_threshold * np.log(2) / 1200)
    low_threshold = np.exp(-out_of_tune_threshold * np.log(2) / 1200)
    ratio = ratio[(ratio < up_threshold) & (ratio > low_threshold)]

    # The mean of an empty array is NaN, which would turn every F0 value into
    # NaN once multiplied; fall back to "no correction" instead.
    if ratio.size == 0:
        print("warn: no reliable frames to estimate pitch correction.")
        return 1.0

    global_offset = ratio.mean()

    # Avoid corrections over semi-tone
    # If more than semi-tone pitch correction is needed, it is better to correct
    # data by hand or fix musicxml or UST instead.
    up_threshold = np.exp(correction_threshold * np.log(2) / 1200)
    low_threshold = np.exp(-correction_threshold * np.log(2) / 1200)

    if global_offset > up_threshold or global_offset < low_threshold:
        # global_offset is a frequency ratio; convert it so that the number
        # printed with the "cent" unit really is in cent.
        offset_cent = 1200 * np.log(global_offset) / np.log(2)
        print(
            "warn: more than 1 semitone pitch correction is needed. "
            f"global_offset: {offset_cent} cent.\n"
            "It is likely that manual pitch corrections are preferable."
        )
    global_offset = np.clip(global_offset, low_threshold, up_threshold)

    return global_offset


def extract_vibrato_parameters_impl(pitch_seg, sr):
    """Extract vibrato parameters for a single pitch segment

    Nakano et al, "An Automatic Singing Skill Evaluation Method
    for Unknown Melodies Using Pitch Interval Accuracy and Vibrato Features"
    Proc. Interspeech 2006.

    山田 et al, "HMM に基づく歌声合成のためのビブラートモデル化"
    IPSJ SIG Tech. Report 2009.

    The segment is expected to contain exactly one more local maximum than
    local minima (maximum, minimum, ..., maximum). Otherwise a warning is
    printed and a tuple of four ``None`` is returned.

    Args:
        pitch_seg (np.ndarray): array of pitch
        sr (int): sampling rate

    Returns:
        tuple: (R, E, m_a, m_f). ``R`` holds the distances (in frames)
        between successive maxima and successive minima, interleaved.
        ``E`` holds the corresponding peak-to-trough differences. ``m_a`` and
        ``m_f`` have the length of ``pitch_seg``; they are zero except at the
        peak positions, where they hold ``0.5 * E`` and ``sr / R``.
    """
    peak_high_pos = argrelmax(pitch_seg)[0]
    peak_low_pos = argrelmin(pitch_seg)[0]

    m_a = np.zeros(len(pitch_seg))
    m_f = np.zeros(len(pitch_seg))

    if len(peak_high_pos) != len(peak_low_pos) + 1:
        print("Warning! Probably a bug...T.T")
        print(peak_high_pos, peak_low_pos)
        return None, None, None, None

    peak_high_pos_diff = np.diff(peak_high_pos)
    peak_low_pos_diff = np.diff(peak_low_pos)

    # Interleave: even slots come from maxima, odd slots from minima.
    R = np.zeros(len(peak_high_pos_diff) + len(peak_low_pos_diff))
    R[0::2] = peak_high_pos_diff
    R[1::2] = peak_low_pos_diff

    m_f_ind = np.zeros(len(R), dtype=int)
    m_f_ind[0::2] = peak_high_pos[:-1]
    m_f_ind[1::2] = peak_low_pos[:-1]
    m_f[m_f_ind] = (1 / R) * sr

    peak_high_pitch = pitch_seg[peak_high_pos]
    peak_low_pitch = pitch_seg[peak_low_pos]

    E = np.zeros(len(R))
    # Mean of two neighbouring maxima minus the minimum between them
    E[0::2] = (peak_high_pitch[1:] + peak_high_pitch[:-1]) / 2 - peak_low_pitch
    # Inner maximum minus the mean of the two minima around it
    E[1::2] = peak_high_pitch[1:-1] - (peak_low_pitch[1:] + peak_low_pitch[:-1]) / 2

    m_a_ind = np.zeros(len(R), dtype=int)
    m_a_ind[0::2] = peak_low_pos
    m_a_ind[1::2] = peak_high_pos[1:-1]
    m_a[m_a_ind] = 0.5 * E

    rate = 1 / R.mean() * sr
    extent = 0.5 * E.mean()
    print(f"Rate: {rate}, Extent: {extent}")

    return R, E, m_a, m_f


def compute_extent(pitch_seg):
    """Compute extent of a pitch segment

    Args:
        pitch_seg (np.ndarray): array of pitch

    Returns:
        np.ndarray: array of extent (peak-to-trough differences, interleaved
        in the same way as in ``extract_vibrato_parameters_impl``).
        ``np.array([-1])`` is returned when the local extrema do not form a
        usable pattern.
    """
    peak_high_pos = argrelmax(pitch_seg)[0]
    peak_low_pos = argrelmin(pitch_seg)[0]

    if len(peak_high_pos) == 1 or len(peak_low_pos) == 1:
        return np.array([-1])

    # Drop trailing minima so that there is one more maximum than minima.
    if len(peak_high_pos) < len(peak_low_pos):
        peak_low_pos = peak_low_pos[:-2]
    elif len(peak_high_pos) == len(peak_low_pos):
        peak_low_pos = peak_low_pos[:-1]

    peak_high_pitch = pitch_seg[peak_high_pos]
    peak_low_pitch = pitch_seg[peak_low_pos]

    peak_high_pos_diff = np.diff(peak_high_pos)
    peak_low_pos_diff = np.diff(peak_low_pos)

    # TODO: would probably be a bug...
    if len(peak_high_pitch) != len(peak_low_pitch) + 1:
        return np.array([-1])

    E = np.zeros(len(peak_high_pos_diff) + len(peak_low_pos_diff))
    E[0::2] = (peak_high_pitch[1:] + peak_high_pitch[:-1]) / 2 - peak_low_pitch
    E[1::2] = peak_high_pitch[1:-1] - (peak_low_pitch[1:] + peak_low_pitch[:-1]) / 2

    return E


def extract_smoothed_f0(f0, sr, cutoff=8):
    """Extract smoothed f0 by low-pass filtering

    Note that the low-pass filter is only applied to voiced segments.

    Args:
        f0 (np.ndarray): array of f0
        sr (int): sampling rate
        cutoff (float): cutoff frequency

    Returns:
        np.ndarray: array of smoothed f0 (a filtered copy; ``f0`` itself is
        not modified)
    """
    segments = nonzero_segments(f0)

    f0_smooth = f0.copy()
    for s, e in segments:
        f0_smooth[s:e] = lowpass_filter(f0[s:e], sr, cutoff=cutoff)

    return f0_smooth


def extract_smoothed_continuous_f0(f0, sr, cutoff=20):
    """Extract smoothed continuous f0 by low-pass filtering

    Note that the input must be continuous F0 or log-F0.

    Args:
        f0 (np.ndarray): array of continuous f0. A 2-D input is flattened
            for filtering and the result is reshaped to ``(len, 1)``.
        sr (int): sampling rate
        cutoff (float): initial cutoff frequency

    Returns:
        np.ndarray: array of smoothed continuous f0
    """
    is_2d = len(f0.shape) == 2
    f0 = f0.reshape(-1) if is_2d else f0

    f0_smooth = lowpass_filter(f0, sr, cutoff=cutoff)

    # Fallback case: shouldn't happen I believe
    # If the filtered contour contains negative values, retry with a higher
    # cutoff, doubling it each time.
    # NOTE: hard-coded for now
    next_cutoff = 50
    while (f0_smooth < 0).any():
        f0_smooth = lowpass_filter(f0, sr, cutoff=next_cutoff)
        next_cutoff *= 2

    f0_smooth = f0_smooth.reshape(len(f0), 1) if is_2d else f0_smooth
    return f0_smooth
def extract_vibrato_likelihood(
    f0_smooth, sr, win_length=32, n_fft=128, min_freq=3, max_freq=8
):
    """Extract vibrato likelihood

    The likelihood of a frame is the product of two quantities computed from
    the normalized magnitude spectrum of the first-order difference of F0:
    the sum of absolute differences between neighbouring frequency bins, and
    the spectral mass inside the ``[min_freq, max_freq)`` band.

    Args:
        f0_smooth (np.ndarray): array of smoothed f0
        sr (int): sampling rate
        win_length (int): window length
        n_fft (int): FFT size
        min_freq (float): minimum frequency of the vibrato
        max_freq (float): maximum frequency of the vibrato

    Returns:
        np.ndarray: array of vibrato likelihood (one value per STFT frame)
    """
    # STFT of the first-order difference of F0, one frame per sample
    delta_f0 = np.diff(f0_smooth)
    spectrum = librosa.stft(
        delta_f0,
        hop_length=1,
        win_length=win_length,
        n_fft=n_fft,
        window="hann",
    )
    magnitude = np.abs(spectrum)
    # Normalize each frame over frequency; the epsilon avoids division by zero
    magnitude_norm = magnitude / (magnitude.sum(0) + 1e-7)

    hz_per_bin = sr / n_fft
    band_start = int(min_freq / hz_per_bin)
    band_stop = int(max_freq / hz_per_bin)

    # Vibrato likelihood = spectral sharpness x in-band mass
    sharpness = np.abs(np.diff(magnitude_norm, axis=0)).sum(0)
    band_mass = magnitude_norm[band_start:band_stop, :].sum(0)
    return sharpness * band_mass
def interp_vibrato(m_f):
    """Interpolate a sequence of vibrato parameter by linear interpolation

    The positive entries, together with the first and the last entry of the
    sequence (whatever their value), are used as anchor points.

    Args:
        m_f (np.ndarray): array of vibrato parameter

    Returns:
        np.ndarray: array of vibrato parameter
    """
    num_frames = len(m_f)
    positive = np.where(m_f > 0)[0]
    # Both ends are always anchors so that every frame lies between two anchors
    anchors = [0, *positive, num_frames - 1]
    return np.interp(np.arange(num_frames), anchors, m_f[anchors])
def extract_vibrato_parameters(
    pitch,
    vibrato_likelihood,
    sr=100,
    threshold=0.12,
    min_cross_count=5,
    min_extent=30,
    max_extent=150,
    interp_params=True,
    smooth_params=False,
    smooth_width=15,
    clip_extent=True,
):
    """Extract vibrato parameters

    Local maxima of ``pitch`` whose vibrato likelihood exceeds ``threshold``
    are used as candidates. For each candidate the end of the section is
    searched forward and the start backward. Per-frame parameters are
    written for the section only if it satisfies the crossing-count and
    extent constraints.

    Args:
        pitch (np.ndarray): array of pitch (smoothed f0)
        vibrato_likelihood (np.ndarray): array of vibrato likelihood
        sr (int): sampling rate
        threshold (float): threshold of vibrato likelihood
        min_cross_count (int): minimum number of cross points
        min_extent (int): minimum extent of vibrato (cent)
        max_extent (int): maximum extent of vibrato (cent)
        interp_params (bool): whether to interpolate vibrato parameters
        smooth_params (bool): whether to smooth vibrato parameters
        smooth_width (int): width of smoothing window
        clip_extent (bool): whether to clip extent

    Returns:
        tuple: ``(vibrato_flags, m_a, m_f)``, each of length
        ``len(vibrato_likelihood)``. ``vibrato_flags`` is 1 inside detected
        sections; ``m_a`` and ``m_f`` hold the values derived from
        ``extract_vibrato_parameters_impl`` for those sections and 0 elsewhere.
    """
    # All outputs are sized by the likelihood sequence.
    T = len(vibrato_likelihood)

    vibrato_flags = np.zeros(T, dtype=int)
    m_a = np.zeros(T)
    m_f = np.zeros(T)

    peak_high_pos = argrelmax(pitch)[0]
    peak_low_pos = argrelmin(pitch)[0]

    # iterate over every peak position
    peak_high_idx = 0
    while peak_high_idx < len(peak_high_pos):
        peak_frame_idx = peak_high_pos[peak_high_idx]

        # Set to True only when a section is accepted for this candidate.
        found = False
        if vibrato_likelihood[peak_frame_idx] > threshold:
            # Initial positions for vibrato section
            start_index = peak_frame_idx
            peaks = peak_low_pos[peak_low_pos > peak_frame_idx]
            if len(peaks) > 0:
                end_index = peaks[0]
            else:
                # No local minimum follows this maximum: try the next peak.
                peak_high_idx += 1
                continue
            # -1 means "no peak to resume from has been determined".
            next_start_peak_high_idx = -1

            # Find a peak position that is close to the next non-speech segment
            # assuming that there's a non-speech segment right after vibrato
            # NOTE: we may want to remove this constraint
            peak_high_pos_rest = peak_high_pos[peak_high_pos > peak_frame_idx]
            for frame_idx in range(end_index, T):
                if pitch[frame_idx] <= 0:
                    peaks = peak_high_pos_rest[peak_high_pos_rest < frame_idx]
                    if len(peaks) > 0:
                        # End the section at the last maximum before the
                        # non-positive frame and resume after that maximum.
                        end_index = peaks[-1]
                        next_start_peak_high_idx = (
                            len(peak_high_pos[peak_high_pos < end_index]) + 1
                        )
                    # Only the first non-positive frame is considered.
                    break

            # Set the search width (backward)
            search_width_backward = 0
            for frame_idx in range(start_index, 0, -1):
                if pitch[frame_idx] <= 0:
                    peaks_backward = peak_high_pos[
                        (peak_high_pos < peak_frame_idx) & (peak_high_pos > frame_idx)
                    ]
                    if len(peaks_backward) > 0:
                        backward = peaks_backward[0]
                        # Number of maxima after `backward`, up to and
                        # including the candidate peak.
                        search_width_backward = len(
                            peak_high_pos[
                                (peak_high_pos > backward)
                                & (peak_high_pos <= peak_frame_idx)
                            ]
                        )
                    # Only the nearest preceding non-positive frame is used.
                    break

            # Find a peak position that satisfies the following vibrato constraints
            # 1) more than 5 times crossing
            # 2) 30 ~ 150 cent oscillation
            estimate_start_index = start_index
            rate = 0
            for peak_idx in range(
                max(peak_high_idx - search_width_backward, 0), peak_high_idx
            ):
                if peak_high_pos[peak_idx] >= T:
                    break
                f0_seg = pitch[peak_high_pos[peak_idx] : end_index]

                # Check if the segment satisfies vibrato constraints
                # cross_count: number of sign changes of the segment around
                # its own mean.
                m = f0_seg.mean()
                cross_count = len(np.where(np.diff(np.sign(f0_seg - m)))[0])

                # Find the start_index so that the vibrato section has more
                # than 5 crossing
                E = compute_extent(f0_seg)
                extent = 0.5 * E.mean()
                having_large_deviation = ((0.5 * E) > max_extent * 2).any()
                if (
                    cross_count >= min_cross_count
                    and cross_count >= rate
                    and extent >= min_extent
                    and extent <= max_extent
                    and not having_large_deviation
                    and (E > 0).all()
                ):
                    # `rate` keeps the largest accepted crossing count so far.
                    rate = cross_count
                    estimate_start_index = peak_high_pos[peak_idx]

            start_index = estimate_start_index

            if rate >= min_cross_count:
                # The slice is extended by one frame before start_index and
                # two slots after end_index; the padding is removed again by
                # the [1:-2] slices below. Presumably this lets the boundary
                # peaks count as interior extrema -- TODO confirm.
                R, E, m_a_seg, m_f_seg = extract_vibrato_parameters_impl(
                    pitch[start_index - 1 : end_index + 2], sr
                )
                if m_a_seg is None:
                    found = False
                    # This leaves the outer while loop: the results collected
                    # so far are returned.
                    break
                found = True
                vibrato_flags[start_index:end_index] = 1

                if interp_params:
                    m_a_seg = interp_vibrato(m_a_seg)
                    # The rate is limited to the hard-coded range [3, 8].
                    m_f_seg = np.clip(interp_vibrato(m_f_seg), 3, 8)

                if smooth_params:
                    # Moving average of width `smooth_width`
                    m_a_seg = np.convolve(
                        m_a_seg, np.ones(smooth_width) / smooth_width, mode="same"
                    )
                    m_f_seg = np.convolve(
                        m_f_seg, np.ones(smooth_width) / smooth_width, mode="same"
                    )

                if clip_extent:
                    m_a_seg = np.clip(m_a_seg, min_extent, max_extent)
                m_a[start_index:end_index] = m_a_seg[1:-2]
                m_f[start_index:end_index] = m_f_seg[1:-2]

                # NOTE(review): next_start_peak_high_idx is still -1 when the
                # forward search above found no non-positive frame, or no
                # maximum before it. This assert would then fail. With
                # assertions disabled, peak_high_idx would become -1 -- verify
                # that callers always provide a non-positive frame after a
                # vibrato section.
                assert next_start_peak_high_idx > peak_high_idx
                peak_high_idx = next_start_peak_high_idx

        if not found:
            peak_high_idx += 1

    return vibrato_flags, m_a, m_f
def gen_sine_vibrato(f0, sr, m_a, m_f, scale=1.0):
    """Generate F0 with sine-based vibrato

    For every run of positive values in ``m_a``, a sinusoidal deviation
    (in cent) is applied to ``f0``. The result is then low-pass filtered
    from the start of the run up to the next voiced-segment end, to avoid a
    discontinuity at the end of the vibrato.

    Args:
        f0 (ndarray): fundamental frequency
        sr (int): sampling rate
        m_a (ndarray): amplitude of vibrato
        m_f (ndarray): frequency of vibrato
        scale (float): scale factor

    Returns:
        ndarray: F0 with sine-based vibrato (``f0`` itself is not modified)
    """
    voiced_ends = np.asarray([end for _, end in nonzero_segments(f0)])
    result = f0.copy()

    for begin, end in nonzero_segments(m_a):
        section = slice(begin, end)
        # Vibrato rate limited to [3, 8] Hz, extent limited to [30, 150] cent
        rate = np.clip(m_f[section], 3, 8)
        depth = np.clip(m_a[section], 30, 150)

        frame_numbers = np.arange(0, end - begin)
        deviation_cent = scale * depth * np.sin(2 * np.pi / sr * rate * frame_numbers)
        result[section] = f0[section] * np.exp(deviation_cent * np.log(2) / 1200)

        # NOTE: this is a hack to avoid discontinuity at the end of vibrato
        following_ends = voiced_ends[voiced_ends > end]
        if len(following_ends) == 0:
            continue
        smooth_until = following_ends[0]
        result[begin:smooth_until] = lowpass_filter(
            result[begin:smooth_until], sr, cutoff=12
        )

    return result